/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        sprintf(scan, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    sprintf(scan, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        sprintf(scan, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        sprintf(scan, ",...");
        while (*scan != '\0') scan++;
    }
    sprintf(scan, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}


//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


static int
__kmp_affinity_cmp_Address_labels(const void
                                  *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}

/** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
        etc.  We don't want to get specific with nomenclature */
    static const kmp_uint32 maxLevels=7;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_depth;
    kmp_uint32 base_num_threads;
    bool uninitialized;

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 numPerLevel[maxLevels];
    kmp_uint32 skipPerLevel[maxLevels];

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(true) {}
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        /* Added explicit initialization of the depth here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP).
*/ 356 depth = 1; 357 uninitialized = false; 358 for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level 359 numPerLevel[i] = 1; 360 skipPerLevel[i] = 1; 361 } 362 363 // Sort table by physical ID 364 if (adr2os) { 365 qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels); 366 deriveLevels(adr2os, num_addrs); 367 } 368 else { 369 numPerLevel[0] = 4; 370 numPerLevel[1] = num_addrs/4; 371 if (num_addrs%4) numPerLevel[1]++; 372 } 373 374 base_num_threads = num_addrs; 375 for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth 376 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1' 377 depth++; 378 379 kmp_uint32 branch = 4; 380 if (numPerLevel[0] == 1) branch = num_addrs/4; 381 if (branch<4) branch=4; 382 for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width 383 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0! 384 if (numPerLevel[d] & 1) numPerLevel[d]++; 385 numPerLevel[d] = numPerLevel[d] >> 1; 386 if (numPerLevel[d+1] == 1) depth++; 387 numPerLevel[d+1] = numPerLevel[d+1] << 1; 388 } 389 if(numPerLevel[0] == 1) { 390 branch = branch >> 1; 391 if (branch<4) branch = 4; 392 } 393 } 394 395 for (kmp_uint32 i=1; i<depth; ++i) 396 skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1]; 397 398 base_depth = depth; 399 } 400 }; 401 402 static hierarchy_info machine_hierarchy; 403 404 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 405 if (machine_hierarchy.uninitialized) 406 machine_hierarchy.init(NULL, nproc); 407 408 if (nproc <= machine_hierarchy.base_num_threads) 409 machine_hierarchy.depth = machine_hierarchy.base_depth; 410 KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0); 411 while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) { 412 machine_hierarchy.depth++; 413 machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2]; 414 } 415 thr_bar->depth = machine_hierarchy.depth; 416 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1; 417 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 418 } 419 420 // 421 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be 422 // called to renumber the labels from [0..n] and place them into the child_num 423 // vector of the address object. This is done in case the labels used for 424 // the children at one node of the hierarchy differ from those used for 425 // another node at the same level. Example: suppose the machine has 2 nodes 426 // with 2 packages each. The first node contains packages 601 and 602, and 427 // second node contains packages 603 and 604. If we try to sort the table 428 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 429 // because we are paying attention to the labels themselves, not the ordinal 430 // child numbers. By using the child numbers in the sort, the result is 431 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. 
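//
// A small standalone illustration (hypothetical names, not library code) of the
// renumbering described above: package labels 601..604 spread over two nodes are
// mapped to ordinal child numbers, and a scatter-style sort over those ordinals
// interleaves the two nodes instead of following the raw labels.
//
#if 0
#include <stdio.h>
#include <stdlib.h>

struct pkg_t { unsigned nodeNum, childNum, label; };

static int cmp_child_num(const void *a, const void *b) {
    const pkg_t *aa = (const pkg_t *)a;
    const pkg_t *bb = (const pkg_t *)b;
    if (aa->childNum != bb->childNum) return aa->childNum < bb->childNum ? -1 : 1;
    if (aa->nodeNum  != bb->nodeNum)  return aa->nodeNum  < bb->nodeNum  ? -1 : 1;
    return 0;
}

int main() {
    // Two nodes with two packages each, labeled as in the comment above;
    // childNum is the package's ordinal position within its node.
    pkg_t pkgs[4] = { {0, 0, 601}, {0, 1, 602}, {1, 0, 603}, {1, 1, 604} };
    qsort(pkgs, 4, sizeof(pkg_t), cmp_child_num);
    for (int i = 0; i < 4; i++)
        printf("{%u,%u} = %u\n", pkgs[i].childNum, pkgs[i].nodeNum, pkgs[i].label);
    // Prints {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604, one pair per line.
    return 0;
}
#endif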
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
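//
// The uniformity test just above is plain arithmetic; a hypothetical standalone
// check (illustrative values only) for a 2-package machine with 4 cores per
// package and 2 hardware threads per core:
//
#if 0
#include <stdio.h>
int main() {
    int nPackages = 2, nCoresPerPkg = 4, nThreadsPerCore = 2;
    int avail_proc = 16;     // all 2 * 4 * 2 = 16 procs are available
    printf("uniform = %d\n", avail_proc == nThreadsPerCore * nCoresPerPkg * nPackages); // 1
    avail_proc = 14;         // e.g. two procs excluded by the initial affinity mask
    printf("uniform = %d\n", avail_proc == nThreadsPerCore * nCoresPerPkg * nPackages); // 0
    return 0;
}
#endif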
511 // 512 static void 513 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth, 514 int pkgLevel, int coreLevel, int threadLevel) 515 { 516 int proc; 517 518 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); 519 for (proc = 0; proc < len; proc++) { 520 int level; 521 kmp_str_buf_t buf; 522 __kmp_str_buf_init(&buf); 523 for (level = 0; level < depth; level++) { 524 if (level == threadLevel) { 525 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread)); 526 } 527 else if (level == coreLevel) { 528 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core)); 529 } 530 else if (level == pkgLevel) { 531 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package)); 532 } 533 else if (level > pkgLevel) { 534 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node), 535 level - pkgLevel - 1); 536 } 537 else { 538 __kmp_str_buf_print(&buf, "L%d ", level); 539 } 540 __kmp_str_buf_print(&buf, "%d ", 541 address2os[proc].first.labels[level]); 542 } 543 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second, 544 buf.str); 545 __kmp_str_buf_free(&buf); 546 } 547 } 548 549 550 // 551 // If we don't know how to retrieve the machine's processor topology, or 552 // encounter an error in doing so, this routine is called to form a "flat" 553 // mapping of os thread id's <-> processor id's. 554 // 555 static int 556 __kmp_affinity_create_flat_map(AddrUnsPair **address2os, 557 kmp_i18n_id_t *const msg_id) 558 { 559 *address2os = NULL; 560 *msg_id = kmp_i18n_null; 561 562 // 563 // Even if __kmp_affinity_type == affinity_none, this routine might still 564 // called to set __kmp_ncores, as well as 565 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 566 // 567 if (! KMP_AFFINITY_CAPABLE()) { 568 KMP_ASSERT(__kmp_affinity_type == affinity_none); 569 __kmp_ncores = nPackages = __kmp_xproc; 570 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 571 if (__kmp_affinity_verbose) { 572 KMP_INFORM(AffFlatTopology, "KMP_AFFINITY"); 573 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 574 KMP_INFORM(Uniform, "KMP_AFFINITY"); 575 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 576 __kmp_nThreadsPerCore, __kmp_ncores); 577 } 578 return 0; 579 } 580 581 // 582 // When affinity is off, this routine will still be called to set 583 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 584 // nCoresPerPkg, & nPackages. Make sure all these vars are set 585 // correctly, and return now if affinity is not enabled. 586 // 587 __kmp_ncores = nPackages = __kmp_avail_proc; 588 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 589 if (__kmp_affinity_verbose) { 590 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 591 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 592 593 KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY"); 594 if (__kmp_affinity_respect_mask) { 595 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 596 } else { 597 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 598 } 599 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 600 KMP_INFORM(Uniform, "KMP_AFFINITY"); 601 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 602 __kmp_nThreadsPerCore, __kmp_ncores); 603 } 604 if (__kmp_affinity_type == affinity_none) { 605 return 0; 606 } 607 608 // 609 // Contruct the data structure to be returned. 610 // 611 *address2os = (AddrUnsPair*) 612 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 613 int avail_ct = 0; 614 unsigned int i; 615 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 616 // 617 // Skip this proc if it is not included in the machine model. 
618 // 619 if (! KMP_CPU_ISSET(i, fullMask)) { 620 continue; 621 } 622 623 Address addr(1); 624 addr.labels[0] = i; 625 (*address2os)[avail_ct++] = AddrUnsPair(addr,i); 626 } 627 if (__kmp_affinity_verbose) { 628 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); 629 } 630 631 if (__kmp_affinity_gran_levels < 0) { 632 // 633 // Only the package level is modeled in the machine topology map, 634 // so the #levels of granularity is either 0 or 1. 635 // 636 if (__kmp_affinity_gran > affinity_gran_package) { 637 __kmp_affinity_gran_levels = 1; 638 } 639 else { 640 __kmp_affinity_gran_levels = 0; 641 } 642 } 643 return 1; 644 } 645 646 647 # if KMP_GROUP_AFFINITY 648 649 // 650 // If multiple Windows* OS processor groups exist, we can create a 2-level 651 // topology map with the groups at level 0 and the individual procs at 652 // level 1. 653 // 654 // This facilitates letting the threads float among all procs in a group, 655 // if granularity=group (the default when there are multiple groups). 656 // 657 static int 658 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, 659 kmp_i18n_id_t *const msg_id) 660 { 661 *address2os = NULL; 662 *msg_id = kmp_i18n_null; 663 664 // 665 // If we don't have multiple processor groups, return now. 666 // The flat mapping will be used. 667 // 668 if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) { 669 // FIXME set *msg_id 670 return -1; 671 } 672 673 // 674 // Contruct the data structure to be returned. 675 // 676 *address2os = (AddrUnsPair*) 677 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 678 int avail_ct = 0; 679 int i; 680 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 681 // 682 // Skip this proc if it is not included in the machine model. 683 // 684 if (! KMP_CPU_ISSET(i, fullMask)) { 685 continue; 686 } 687 688 Address addr(2); 689 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); 690 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); 691 (*address2os)[avail_ct++] = AddrUnsPair(addr,i); 692 693 if (__kmp_affinity_verbose) { 694 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], 695 addr.labels[1]); 696 } 697 } 698 699 if (__kmp_affinity_gran_levels < 0) { 700 if (__kmp_affinity_gran == affinity_gran_group) { 701 __kmp_affinity_gran_levels = 1; 702 } 703 else if ((__kmp_affinity_gran == affinity_gran_fine) 704 || (__kmp_affinity_gran == affinity_gran_thread)) { 705 __kmp_affinity_gran_levels = 0; 706 } 707 else { 708 const char *gran_str = NULL; 709 if (__kmp_affinity_gran == affinity_gran_core) { 710 gran_str = "core"; 711 } 712 else if (__kmp_affinity_gran == affinity_gran_package) { 713 gran_str = "package"; 714 } 715 else if (__kmp_affinity_gran == affinity_gran_node) { 716 gran_str = "node"; 717 } 718 else { 719 KMP_ASSERT(0); 720 } 721 722 // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread" 723 __kmp_affinity_gran_levels = 0; 724 } 725 } 726 return 2; 727 } 728 729 # endif /* KMP_GROUP_AFFINITY */ 730 731 732 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 733 734 static int 735 __kmp_cpuid_mask_width(int count) { 736 int r = 0; 737 738 while((1<<r) < count) 739 ++r; 740 return r; 741 } 742 743 744 class apicThreadInfo { 745 public: 746 unsigned osId; // param to __kmp_affinity_bind_thread 747 unsigned apicId; // from cpuid after binding 748 unsigned maxCoresPerPkg; // "" 749 unsigned maxThreadsPerPkg; // "" 750 unsigned pkgId; // inferred from above values 751 unsigned coreId; // "" 752 unsigned threadId; // "" 753 }; 754 755 756 static int 757 
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b) 758 { 759 const apicThreadInfo *aa = (const apicThreadInfo *)a; 760 const apicThreadInfo *bb = (const apicThreadInfo *)b; 761 if (aa->osId < bb->osId) return -1; 762 if (aa->osId > bb->osId) return 1; 763 return 0; 764 } 765 766 767 static int 768 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b) 769 { 770 const apicThreadInfo *aa = (const apicThreadInfo *)a; 771 const apicThreadInfo *bb = (const apicThreadInfo *)b; 772 if (aa->pkgId < bb->pkgId) return -1; 773 if (aa->pkgId > bb->pkgId) return 1; 774 if (aa->coreId < bb->coreId) return -1; 775 if (aa->coreId > bb->coreId) return 1; 776 if (aa->threadId < bb->threadId) return -1; 777 if (aa->threadId > bb->threadId) return 1; 778 return 0; 779 } 780 781 782 // 783 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 784 // an algorithm which cycles through the available os threads, setting 785 // the current thread's affinity mask to that thread, and then retrieves 786 // the Apic Id for each thread context using the cpuid instruction. 787 // 788 static int 789 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 790 kmp_i18n_id_t *const msg_id) 791 { 792 kmp_cpuid buf; 793 int rc; 794 *address2os = NULL; 795 *msg_id = kmp_i18n_null; 796 797 // 798 // Check if cpuid leaf 4 is supported. 799 // 800 __kmp_x86_cpuid(0, 0, &buf); 801 if (buf.eax < 4) { 802 *msg_id = kmp_i18n_str_NoLeaf4Support; 803 return -1; 804 } 805 806 // 807 // The algorithm used starts by setting the affinity to each available 808 // thread and retrieving info from the cpuid instruction, so if we are 809 // not capable of calling __kmp_get_system_affinity() and 810 // _kmp_get_system_affinity(), then we need to do something else - use 811 // the defaults that we calculated from issuing cpuid without binding 812 // to each proc. 813 // 814 if (! KMP_AFFINITY_CAPABLE()) { 815 // 816 // Hack to try and infer the machine topology using only the data 817 // available from cpuid on the current thread, and __kmp_xproc. 818 // 819 KMP_ASSERT(__kmp_affinity_type == affinity_none); 820 821 // 822 // Get an upper bound on the number of threads per package using 823 // cpuid(1). 824 // 825 // On some OS/chps combinations where HT is supported by the chip 826 // but is disabled, this value will be 2 on a single core chip. 827 // Usually, it will be 2 if HT is enabled and 1 if HT is disabled. 828 // 829 __kmp_x86_cpuid(1, 0, &buf); 830 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 831 if (maxThreadsPerPkg == 0) { 832 maxThreadsPerPkg = 1; 833 } 834 835 // 836 // The num cores per pkg comes from cpuid(4). 837 // 1 must be added to the encoded value. 838 // 839 // The author of cpu_count.cpp treated this only an upper bound 840 // on the number of cores, but I haven't seen any cases where it 841 // was greater than the actual number of cores, so we will treat 842 // it as exact in this block of code. 843 // 844 // First, we need to check if cpuid(4) is supported on this chip. 845 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 846 // has the value n or greater. 
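//
// A sketch (made-up register values, no cpuid actually executed) of the
// bit-field decoding described above and applied just below: ebx[23:16] of
// leaf 1 bounds the logical processors per package, and eax[31:26] of leaf 4,
// plus one, gives the cores per package when leaf 4 is supported.
//
#if 0
#include <stdio.h>
int main() {
    unsigned leaf0_eax = 0x0B;          // highest supported leaf; >= 4, so leaf 4 is valid
    unsigned leaf1_ebx = 0x00100800;    // bits 23:16 = 0x10 -> at most 16 logical procs/pkg
    unsigned leaf4_eax = 0x1C004121;    // bits 31:26 = 7 -> 7 + 1 = 8 cores/pkg
    int maxThreadsPerPkg = (leaf1_ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) maxThreadsPerPkg = 1;
    int nCoresPerPkg = (leaf0_eax >= 4) ? (int)(((leaf4_eax >> 26) & 0x3f) + 1) : 1;
    printf("max %d threads/pkg, %d cores/pkg\n", maxThreadsPerPkg, nCoresPerPkg); // 16, 8
    return 0;
}
#endif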
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but situations have been observed where it
    //    is not exact.  In particular, on certain OS/chip combinations where
    //    Intel(R) Hyper-Threading Technology is supported by the chip but has
    //    been disabled, the value of this field will be 2 (for a single core
    //    chip).  On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
    //
    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.
The comments in "cpucount.cpp" say that this value is 931 // an upper bound, but the IA-32 architecture manual says that it is 932 // exactly the number of cores per package, and I haven't seen any 933 // case where it wasn't. 934 // 935 // From this information, deduce the package Id, core Id, and thread Id, 936 // and set the corresponding fields in the apicThreadInfo struct. 937 // 938 unsigned i; 939 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( 940 __kmp_avail_proc * sizeof(apicThreadInfo)); 941 unsigned nApics = 0; 942 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 943 // 944 // Skip this proc if it is not included in the machine model. 945 // 946 if (! KMP_CPU_ISSET(i, fullMask)) { 947 continue; 948 } 949 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 950 951 __kmp_affinity_bind_thread(i); 952 threadInfo[nApics].osId = i; 953 954 // 955 // The apic id and max threads per pkg come from cpuid(1). 956 // 957 __kmp_x86_cpuid(1, 0, &buf); 958 if (! (buf.edx >> 9) & 1) { 959 __kmp_set_system_affinity(oldMask, TRUE); 960 __kmp_free(threadInfo); 961 KMP_CPU_FREE(oldMask); 962 *msg_id = kmp_i18n_str_ApicNotPresent; 963 return -1; 964 } 965 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 966 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 967 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 968 threadInfo[nApics].maxThreadsPerPkg = 1; 969 } 970 971 // 972 // Max cores per pkg comes from cpuid(4). 973 // 1 must be added to the encoded value. 974 // 975 // First, we need to check if cpuid(4) is supported on this chip. 976 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 977 // has the value n or greater. 978 // 979 __kmp_x86_cpuid(0, 0, &buf); 980 if (buf.eax >= 4) { 981 __kmp_x86_cpuid(4, 0, &buf); 982 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 983 } 984 else { 985 threadInfo[nApics].maxCoresPerPkg = 1; 986 } 987 988 // 989 // Infer the pkgId / coreId / threadId using only the info 990 // obtained locally. 991 // 992 int widthCT = __kmp_cpuid_mask_width( 993 threadInfo[nApics].maxThreadsPerPkg); 994 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 995 996 int widthC = __kmp_cpuid_mask_width( 997 threadInfo[nApics].maxCoresPerPkg); 998 int widthT = widthCT - widthC; 999 if (widthT < 0) { 1000 // 1001 // I've never seen this one happen, but I suppose it could, if 1002 // the cpuid instruction on a chip was really screwed up. 1003 // Make sure to restore the affinity mask before the tail call. 1004 // 1005 __kmp_set_system_affinity(oldMask, TRUE); 1006 __kmp_free(threadInfo); 1007 KMP_CPU_FREE(oldMask); 1008 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1009 return -1; 1010 } 1011 1012 int maskC = (1 << widthC) - 1; 1013 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) 1014 &maskC; 1015 1016 int maskT = (1 << widthT) - 1; 1017 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT; 1018 1019 nApics++; 1020 } 1021 1022 // 1023 // We've collected all the info we need. 1024 // Restore the old affinity mask for this thread. 1025 // 1026 __kmp_set_system_affinity(oldMask, TRUE); 1027 1028 // 1029 // If there's only one thread context to bind to, form an Address object 1030 // with depth 1 and return immediately (or, if affinity is off, set 1031 // address2os to NULL and return). 
1032 // 1033 // If it is configured to omit the package level when there is only a 1034 // single package, the logic at the end of this routine won't work if 1035 // there is only a single thread - it would try to form an Address 1036 // object with depth 0. 1037 // 1038 KMP_ASSERT(nApics > 0); 1039 if (nApics == 1) { 1040 __kmp_ncores = nPackages = 1; 1041 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1042 if (__kmp_affinity_verbose) { 1043 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1044 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1045 1046 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1047 if (__kmp_affinity_respect_mask) { 1048 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1049 } else { 1050 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1051 } 1052 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1053 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1054 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1055 __kmp_nThreadsPerCore, __kmp_ncores); 1056 } 1057 1058 if (__kmp_affinity_type == affinity_none) { 1059 __kmp_free(threadInfo); 1060 KMP_CPU_FREE(oldMask); 1061 return 0; 1062 } 1063 1064 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1065 Address addr(1); 1066 addr.labels[0] = threadInfo[0].pkgId; 1067 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1068 1069 if (__kmp_affinity_gran_levels < 0) { 1070 __kmp_affinity_gran_levels = 0; 1071 } 1072 1073 if (__kmp_affinity_verbose) { 1074 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1075 } 1076 1077 __kmp_free(threadInfo); 1078 KMP_CPU_FREE(oldMask); 1079 return 1; 1080 } 1081 1082 // 1083 // Sort the threadInfo table by physical Id. 1084 // 1085 qsort(threadInfo, nApics, sizeof(*threadInfo), 1086 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1087 1088 // 1089 // The table is now sorted by pkgId / coreId / threadId, but we really 1090 // don't know the radix of any of the fields. pkgId's may be sparsely 1091 // assigned among the chips on a system. Although coreId's are usually 1092 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1093 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1094 // 1095 // For that matter, we don't know what coresPerPkg and threadsPerCore 1096 // (or the total # packages) are at this point - we want to determine 1097 // that now. We only have an upper bound on the first two figures. 1098 // 1099 // We also perform a consistency check at this point: the values returned 1100 // by the cpuid instruction for any thread bound to a given package had 1101 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg. 
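//
// A compact standalone sketch (not the library's data structures) of the radix
// counting that follows: given records already sorted by (pkgId, coreId,
// threadId), count the packages, the widest core count per package, and the
// widest thread count per core, without assuming the ids are dense.
//
#if 0
#include <stdio.h>

struct rec_t { unsigned pkg, core, thread; };

int main() {
    // One package with 2 cores x 2 threads plus one package with 1 core x 2 threads.
    rec_t v[6] = { {0,0,0}, {0,0,1}, {0,1,0}, {0,1,1}, {3,5,0}, {3,5,1} };
    unsigned nPkgs = 1, maxCores = 1, maxThreads = 1, coreCt = 1, threadCt = 1;
    for (int i = 1; i < 6; i++) {
        if (v[i].pkg != v[i-1].pkg) {           // new package: close out core/thread counts
            nPkgs++;
            if (coreCt > maxCores) maxCores = coreCt;
            if (threadCt > maxThreads) maxThreads = threadCt;
            coreCt = 1; threadCt = 1;
        } else if (v[i].core != v[i-1].core) {  // new core within the same package
            coreCt++;
            if (threadCt > maxThreads) maxThreads = threadCt;
            threadCt = 1;
        } else {                                // same package and core, new thread id
            threadCt++;
        }
    }
    if (coreCt > maxCores) maxCores = coreCt;
    if (threadCt > maxThreads) maxThreads = threadCt;
    printf("%u packages, <= %u cores/pkg, <= %u threads/core\n",
           nPkgs, maxCores, maxThreads);        // 2, 2, 2
    return 0;
}
#endif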
1102 // 1103 nPackages = 1; 1104 nCoresPerPkg = 1; 1105 __kmp_nThreadsPerCore = 1; 1106 unsigned nCores = 1; 1107 1108 unsigned pkgCt = 1; // to determine radii 1109 unsigned lastPkgId = threadInfo[0].pkgId; 1110 unsigned coreCt = 1; 1111 unsigned lastCoreId = threadInfo[0].coreId; 1112 unsigned threadCt = 1; 1113 unsigned lastThreadId = threadInfo[0].threadId; 1114 1115 // intra-pkg consist checks 1116 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1117 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1118 1119 for (i = 1; i < nApics; i++) { 1120 if (threadInfo[i].pkgId != lastPkgId) { 1121 nCores++; 1122 pkgCt++; 1123 lastPkgId = threadInfo[i].pkgId; 1124 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1125 coreCt = 1; 1126 lastCoreId = threadInfo[i].coreId; 1127 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1128 threadCt = 1; 1129 lastThreadId = threadInfo[i].threadId; 1130 1131 // 1132 // This is a different package, so go on to the next iteration 1133 // without doing any consistency checks. Reset the consistency 1134 // check vars, though. 1135 // 1136 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1137 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1138 continue; 1139 } 1140 1141 if (threadInfo[i].coreId != lastCoreId) { 1142 nCores++; 1143 coreCt++; 1144 lastCoreId = threadInfo[i].coreId; 1145 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1146 threadCt = 1; 1147 lastThreadId = threadInfo[i].threadId; 1148 } 1149 else if (threadInfo[i].threadId != lastThreadId) { 1150 threadCt++; 1151 lastThreadId = threadInfo[i].threadId; 1152 } 1153 else { 1154 __kmp_free(threadInfo); 1155 KMP_CPU_FREE(oldMask); 1156 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1157 return -1; 1158 } 1159 1160 // 1161 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1162 // fields agree between all the threads bounds to a given package. 1163 // 1164 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) 1165 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1166 __kmp_free(threadInfo); 1167 KMP_CPU_FREE(oldMask); 1168 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1169 return -1; 1170 } 1171 } 1172 nPackages = pkgCt; 1173 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1174 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1175 1176 // 1177 // When affinity is off, this routine will still be called to set 1178 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1179 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1180 // correctly, and return now if affinity is not enabled. 
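//
// The depth of the map constructed below follows directly from which levels are
// actually modeled; a small illustrative calculation (hypothetical counts),
// mirroring the pkgLevel / coreLevel / threadLevel assignment further down:
//
#if 0
#include <stdio.h>
int main() {
    int nCoresPerPkg = 4, nThreadsPerCore = 1;     // e.g. Hyper-Threading disabled
    int pkgLevel = 0;                              // the package level is always modeled
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
    printf("depth = %d\n", depth);                 // 2: each Address is {pkgId, coreId}
    return 0;
}
#endif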
1181 // 1182 __kmp_ncores = nCores; 1183 if (__kmp_affinity_verbose) { 1184 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1185 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1186 1187 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1188 if (__kmp_affinity_respect_mask) { 1189 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1190 } else { 1191 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1192 } 1193 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1194 if (__kmp_affinity_uniform_topology()) { 1195 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1196 } else { 1197 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1198 } 1199 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1200 __kmp_nThreadsPerCore, __kmp_ncores); 1201 1202 } 1203 1204 if (__kmp_affinity_type == affinity_none) { 1205 __kmp_free(threadInfo); 1206 KMP_CPU_FREE(oldMask); 1207 return 0; 1208 } 1209 1210 // 1211 // Now that we've determined the number of packages, the number of cores 1212 // per package, and the number of threads per core, we can construct the 1213 // data structure that is to be returned. 1214 // 1215 int pkgLevel = 0; 1216 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1217 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1); 1218 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1219 1220 KMP_ASSERT(depth > 0); 1221 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1222 1223 for (i = 0; i < nApics; ++i) { 1224 Address addr(depth); 1225 unsigned os = threadInfo[i].osId; 1226 int d = 0; 1227 1228 if (pkgLevel >= 0) { 1229 addr.labels[d++] = threadInfo[i].pkgId; 1230 } 1231 if (coreLevel >= 0) { 1232 addr.labels[d++] = threadInfo[i].coreId; 1233 } 1234 if (threadLevel >= 0) { 1235 addr.labels[d++] = threadInfo[i].threadId; 1236 } 1237 (*address2os)[i] = AddrUnsPair(addr, os); 1238 } 1239 1240 if (__kmp_affinity_gran_levels < 0) { 1241 // 1242 // Set the granularity level based on what levels are modeled 1243 // in the machine topology map. 1244 // 1245 __kmp_affinity_gran_levels = 0; 1246 if ((threadLevel >= 0) 1247 && (__kmp_affinity_gran > affinity_gran_thread)) { 1248 __kmp_affinity_gran_levels++; 1249 } 1250 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1251 __kmp_affinity_gran_levels++; 1252 } 1253 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1254 __kmp_affinity_gran_levels++; 1255 } 1256 } 1257 1258 if (__kmp_affinity_verbose) { 1259 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1260 coreLevel, threadLevel); 1261 } 1262 1263 __kmp_free(threadInfo); 1264 KMP_CPU_FREE(oldMask); 1265 return depth; 1266 } 1267 1268 1269 // 1270 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1271 // architectures support a newer interface for specifying the x2APIC Ids, 1272 // based on cpuid leaf 11. 1273 // 1274 static int 1275 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1276 kmp_i18n_id_t *const msg_id) 1277 { 1278 kmp_cpuid buf; 1279 1280 *address2os = NULL; 1281 *msg_id = kmp_i18n_null; 1282 1283 // 1284 // Check to see if cpuid leaf 11 is supported. 1285 // 1286 __kmp_x86_cpuid(0, 0, &buf); 1287 if (buf.eax < 11) { 1288 *msg_id = kmp_i18n_str_NoLeaf11Support; 1289 return -1; 1290 } 1291 __kmp_x86_cpuid(11, 0, &buf); 1292 if (buf.ebx == 0) { 1293 *msg_id = kmp_i18n_str_NoLeaf11Support; 1294 return -1; 1295 } 1296 1297 // 1298 // Find the number of levels in the machine topology. 
While we're at it, 1299 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will 1300 // try to get more accurate values later by explicitly counting them, 1301 // but get reasonable defaults now, in case we return early. 1302 // 1303 int level; 1304 int threadLevel = -1; 1305 int coreLevel = -1; 1306 int pkgLevel = -1; 1307 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1308 1309 for (level = 0;; level++) { 1310 if (level > 31) { 1311 // 1312 // FIXME: Hack for DPD200163180 1313 // 1314 // If level is big then something went wrong -> exiting 1315 // 1316 // There could actually be 32 valid levels in the machine topology, 1317 // but so far, the only machine we have seen which does not exit 1318 // this loop before iteration 32 has fubar x2APIC settings. 1319 // 1320 // For now, just reject this case based upon loop trip count. 1321 // 1322 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1323 return -1; 1324 } 1325 __kmp_x86_cpuid(11, level, &buf); 1326 if (buf.ebx == 0) { 1327 if (pkgLevel < 0) { 1328 // 1329 // Will infer nPackages from __kmp_xproc 1330 // 1331 pkgLevel = level; 1332 level++; 1333 } 1334 break; 1335 } 1336 int kind = (buf.ecx >> 8) & 0xff; 1337 if (kind == 1) { 1338 // 1339 // SMT level 1340 // 1341 threadLevel = level; 1342 coreLevel = -1; 1343 pkgLevel = -1; 1344 __kmp_nThreadsPerCore = buf.ebx & 0xff; 1345 if (__kmp_nThreadsPerCore == 0) { 1346 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1347 return -1; 1348 } 1349 } 1350 else if (kind == 2) { 1351 // 1352 // core level 1353 // 1354 coreLevel = level; 1355 pkgLevel = -1; 1356 nCoresPerPkg = buf.ebx & 0xff; 1357 if (nCoresPerPkg == 0) { 1358 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1359 return -1; 1360 } 1361 } 1362 else { 1363 if (level <= 0) { 1364 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1365 return -1; 1366 } 1367 if (pkgLevel >= 0) { 1368 continue; 1369 } 1370 pkgLevel = level; 1371 nPackages = buf.ebx & 0xff; 1372 if (nPackages == 0) { 1373 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1374 return -1; 1375 } 1376 } 1377 } 1378 int depth = level; 1379 1380 // 1381 // In the above loop, "level" was counted from the finest level (usually 1382 // thread) to the coarsest. The caller expects that we will place the 1383 // labels in (*address2os)[].first.labels[] in the inverse order, so 1384 // we need to invert the vars saying which level means what. 1385 // 1386 if (threadLevel >= 0) { 1387 threadLevel = depth - threadLevel - 1; 1388 } 1389 if (coreLevel >= 0) { 1390 coreLevel = depth - coreLevel - 1; 1391 } 1392 KMP_DEBUG_ASSERT(pkgLevel >= 0); 1393 pkgLevel = depth - pkgLevel - 1; 1394 1395 // 1396 // The algorithm used starts by setting the affinity to each available 1397 // thread and retrieving info from the cpuid instruction, so if we are 1398 // not capable of calling __kmp_get_system_affinity() and 1399 // _kmp_get_system_affinity(), then we need to do something else - use 1400 // the defaults that we calculated from issuing cpuid without binding 1401 // to each proc. 1402 // 1403 if (! KMP_AFFINITY_CAPABLE()) 1404 { 1405 // 1406 // Hack to try and infer the machine topology using only the data 1407 // available from cpuid on the current thread, and __kmp_xproc. 
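//
// cpuid leaf 11 enumerates levels from finest (SMT) to coarsest (package), while
// the returned labels are stored coarsest-first; a tiny sketch of the index
// inversion used above, for a hypothetical 3-level machine:
//
#if 0
#include <stdio.h>
int main() {
    int depth = 3;
    int threadLevel = 0, coreLevel = 1, pkgLevel = 2;  // order reported by cpuid(11, level)
    threadLevel = depth - threadLevel - 1;             // -> 2 (last label)
    coreLevel   = depth - coreLevel   - 1;             // -> 1
    pkgLevel    = depth - pkgLevel    - 1;             // -> 0 (first label)
    printf("pkg@%d core@%d thread@%d\n", pkgLevel, coreLevel, threadLevel);
    return 0;
}
#endif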
1408 // 1409 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1410 1411 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1412 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1413 if (__kmp_affinity_verbose) { 1414 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1415 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1416 if (__kmp_affinity_uniform_topology()) { 1417 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1418 } else { 1419 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1420 } 1421 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1422 __kmp_nThreadsPerCore, __kmp_ncores); 1423 } 1424 return 0; 1425 } 1426 1427 // 1428 // 1429 // From here on, we can assume that it is safe to call 1430 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), 1431 // even if __kmp_affinity_type = affinity_none. 1432 // 1433 1434 // 1435 // Save the affinity mask for the current thread. 1436 // 1437 kmp_affin_mask_t *oldMask; 1438 KMP_CPU_ALLOC(oldMask); 1439 __kmp_get_system_affinity(oldMask, TRUE); 1440 1441 // 1442 // Allocate the data structure to be returned. 1443 // 1444 AddrUnsPair *retval = (AddrUnsPair *) 1445 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1446 1447 // 1448 // Run through each of the available contexts, binding the current thread 1449 // to it, and obtaining the pertinent information using the cpuid instr. 1450 // 1451 unsigned int proc; 1452 int nApics = 0; 1453 for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) { 1454 // 1455 // Skip this proc if it is not included in the machine model. 1456 // 1457 if (! KMP_CPU_ISSET(proc, fullMask)) { 1458 continue; 1459 } 1460 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1461 1462 __kmp_affinity_bind_thread(proc); 1463 1464 // 1465 // Extrach the labels for each level in the machine topology map 1466 // from the Apic ID. 1467 // 1468 Address addr(depth); 1469 int prev_shift = 0; 1470 1471 for (level = 0; level < depth; level++) { 1472 __kmp_x86_cpuid(11, level, &buf); 1473 unsigned apicId = buf.edx; 1474 if (buf.ebx == 0) { 1475 if (level != depth - 1) { 1476 KMP_CPU_FREE(oldMask); 1477 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1478 return -1; 1479 } 1480 addr.labels[depth - level - 1] = apicId >> prev_shift; 1481 level++; 1482 break; 1483 } 1484 int shift = buf.eax & 0x1f; 1485 int mask = (1 << shift) - 1; 1486 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1487 prev_shift = shift; 1488 } 1489 if (level != depth) { 1490 KMP_CPU_FREE(oldMask); 1491 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1492 return -1; 1493 } 1494 1495 retval[nApics] = AddrUnsPair(addr, proc); 1496 nApics++; 1497 } 1498 1499 // 1500 // We've collected all the info we need. 1501 // Restore the old affinity mask for this thread. 1502 // 1503 __kmp_set_system_affinity(oldMask, TRUE); 1504 1505 // 1506 // If there's only one thread context to bind to, return now. 
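//
// A worked standalone example (made-up shift widths) of the x2APIC label
// extraction above: each cpuid(11, level) reports the cumulative shift width for
// that level, and the label is the slice of the APIC ID between the previous
// and the current shift; the last level keeps everything above the previous shift.
//
#if 0
#include <stdio.h>
int main() {
    unsigned apicId = 0x2D;              // 0b101101
    int shifts[2] = { 1, 3 };            // cumulative widths: 1 SMT bit, then 2 core bits
    int depth = 3;                       // thread, core, package
    unsigned labels[3];
    int prev_shift = 0;
    for (int level = 0; level < depth; level++) {
        if (level == depth - 1) {        // coarsest level: take all remaining high bits
            labels[depth - level - 1] = apicId >> prev_shift;
            break;
        }
        int shift = shifts[level];
        unsigned mask = (1u << shift) - 1;
        labels[depth - level - 1] = (apicId & mask) >> prev_shift;
        prev_shift = shift;
    }
    // labels[] is coarsest-first: package 5, core 2, thread 1
    printf("pkg %u core %u thread %u\n", labels[0], labels[1], labels[2]);
    return 0;
}
#endif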
1507 // 1508 KMP_ASSERT(nApics > 0); 1509 if (nApics == 1) { 1510 __kmp_ncores = nPackages = 1; 1511 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1512 if (__kmp_affinity_verbose) { 1513 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1514 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1515 1516 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1517 if (__kmp_affinity_respect_mask) { 1518 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1519 } else { 1520 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1521 } 1522 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1523 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1524 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1525 __kmp_nThreadsPerCore, __kmp_ncores); 1526 } 1527 1528 if (__kmp_affinity_type == affinity_none) { 1529 __kmp_free(retval); 1530 KMP_CPU_FREE(oldMask); 1531 return 0; 1532 } 1533 1534 // 1535 // Form an Address object which only includes the package level. 1536 // 1537 Address addr(1); 1538 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1539 retval[0].first = addr; 1540 1541 if (__kmp_affinity_gran_levels < 0) { 1542 __kmp_affinity_gran_levels = 0; 1543 } 1544 1545 if (__kmp_affinity_verbose) { 1546 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1547 } 1548 1549 *address2os = retval; 1550 KMP_CPU_FREE(oldMask); 1551 return 1; 1552 } 1553 1554 // 1555 // Sort the table by physical Id. 1556 // 1557 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1558 1559 // 1560 // Find the radix at each of the levels. 1561 // 1562 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1563 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1564 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1565 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1566 for (level = 0; level < depth; level++) { 1567 totals[level] = 1; 1568 maxCt[level] = 1; 1569 counts[level] = 1; 1570 last[level] = retval[0].first.labels[level]; 1571 } 1572 1573 // 1574 // From here on, the iteration variable "level" runs from the finest 1575 // level to the coarsest, i.e. we iterate forward through 1576 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1577 // backwards. 1578 // 1579 for (proc = 1; (int)proc < nApics; proc++) { 1580 int level; 1581 for (level = 0; level < depth; level++) { 1582 if (retval[proc].first.labels[level] != last[level]) { 1583 int j; 1584 for (j = level + 1; j < depth; j++) { 1585 totals[j]++; 1586 counts[j] = 1; 1587 // The line below causes printing incorrect topology information 1588 // in case the max value for some level (maxCt[level]) is encountered earlier than 1589 // some less value while going through the array. 1590 // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2 1591 // whereas it must be 4. 1592 // TODO!!! 
Check if it can be commented safely 1593 //maxCt[j] = 1; 1594 last[j] = retval[proc].first.labels[j]; 1595 } 1596 totals[level]++; 1597 counts[level]++; 1598 if (counts[level] > maxCt[level]) { 1599 maxCt[level] = counts[level]; 1600 } 1601 last[level] = retval[proc].first.labels[level]; 1602 break; 1603 } 1604 else if (level == depth - 1) { 1605 __kmp_free(last); 1606 __kmp_free(maxCt); 1607 __kmp_free(counts); 1608 __kmp_free(totals); 1609 __kmp_free(retval); 1610 KMP_CPU_FREE(oldMask); 1611 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1612 return -1; 1613 } 1614 } 1615 } 1616 1617 // 1618 // When affinity is off, this routine will still be called to set 1619 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1620 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1621 // correctly, and return if affinity is not enabled. 1622 // 1623 if (threadLevel >= 0) { 1624 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1625 } 1626 else { 1627 __kmp_nThreadsPerCore = 1; 1628 } 1629 nPackages = totals[pkgLevel]; 1630 1631 if (coreLevel >= 0) { 1632 __kmp_ncores = totals[coreLevel]; 1633 nCoresPerPkg = maxCt[coreLevel]; 1634 } 1635 else { 1636 __kmp_ncores = nPackages; 1637 nCoresPerPkg = 1; 1638 } 1639 1640 // 1641 // Check to see if the machine topology is uniform 1642 // 1643 unsigned prod = maxCt[0]; 1644 for (level = 1; level < depth; level++) { 1645 prod *= maxCt[level]; 1646 } 1647 bool uniform = (prod == totals[level - 1]); 1648 1649 // 1650 // Print the machine topology summary. 1651 // 1652 if (__kmp_affinity_verbose) { 1653 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1654 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1655 1656 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1657 if (__kmp_affinity_respect_mask) { 1658 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1659 } else { 1660 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1661 } 1662 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1663 if (uniform) { 1664 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1665 } else { 1666 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1667 } 1668 1669 kmp_str_buf_t buf; 1670 __kmp_str_buf_init(&buf); 1671 1672 __kmp_str_buf_print(&buf, "%d", totals[0]); 1673 for (level = 1; level <= pkgLevel; level++) { 1674 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1675 } 1676 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1677 __kmp_nThreadsPerCore, __kmp_ncores); 1678 1679 __kmp_str_buf_free(&buf); 1680 } 1681 1682 if (__kmp_affinity_type == affinity_none) { 1683 __kmp_free(last); 1684 __kmp_free(maxCt); 1685 __kmp_free(counts); 1686 __kmp_free(totals); 1687 __kmp_free(retval); 1688 KMP_CPU_FREE(oldMask); 1689 return 0; 1690 } 1691 1692 // 1693 // Find any levels with radiix 1, and remove them from the map 1694 // (except for the package level). 1695 // 1696 int new_depth = 0; 1697 for (level = 0; level < depth; level++) { 1698 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1699 continue; 1700 } 1701 new_depth++; 1702 } 1703 1704 // 1705 // If we are removing any levels, allocate a new vector to return, 1706 // and copy the relevant information to it. 
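//
// A small sketch (illustrative radices only) of the radix-1 level compression
// performed below: levels whose radix is 1 are dropped, except the package
// level, so the map keeps only the levels that actually distinguish procs.
//
#if 0
#include <stdio.h>
int main() {
    unsigned maxCt[3] = { 2, 4, 1 };     // package x core x thread radices, coarsest first
    int pkgLevel = 0;
    int new_depth = 0;
    for (int level = 0; level < 3; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) continue;
        new_depth++;
    }
    printf("new_depth = %d\n", new_depth);   // 2: the radix-1 thread level is dropped
    return 0;
}
#endif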
1707 // 1708 if (new_depth != depth) { 1709 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1710 sizeof(AddrUnsPair) * nApics); 1711 for (proc = 0; (int)proc < nApics; proc++) { 1712 Address addr(new_depth); 1713 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1714 } 1715 int new_level = 0; 1716 for (level = 0; level < depth; level++) { 1717 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1718 if (level == threadLevel) { 1719 threadLevel = -1; 1720 } 1721 else if ((threadLevel >= 0) && (level < threadLevel)) { 1722 threadLevel--; 1723 } 1724 if (level == coreLevel) { 1725 coreLevel = -1; 1726 } 1727 else if ((coreLevel >= 0) && (level < coreLevel)) { 1728 coreLevel--; 1729 } 1730 if (level < pkgLevel) { 1731 pkgLevel--; 1732 } 1733 continue; 1734 } 1735 for (proc = 0; (int)proc < nApics; proc++) { 1736 new_retval[proc].first.labels[new_level] 1737 = retval[proc].first.labels[level]; 1738 } 1739 new_level++; 1740 } 1741 1742 __kmp_free(retval); 1743 retval = new_retval; 1744 depth = new_depth; 1745 } 1746 1747 if (__kmp_affinity_gran_levels < 0) { 1748 // 1749 // Set the granularity level based on what levels are modeled 1750 // in the machine topology map. 1751 // 1752 __kmp_affinity_gran_levels = 0; 1753 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1754 __kmp_affinity_gran_levels++; 1755 } 1756 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1757 __kmp_affinity_gran_levels++; 1758 } 1759 if (__kmp_affinity_gran > affinity_gran_package) { 1760 __kmp_affinity_gran_levels++; 1761 } 1762 } 1763 1764 if (__kmp_affinity_verbose) { 1765 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1766 coreLevel, threadLevel); 1767 } 1768 1769 __kmp_free(last); 1770 __kmp_free(maxCt); 1771 __kmp_free(counts); 1772 __kmp_free(totals); 1773 KMP_CPU_FREE(oldMask); 1774 *address2os = retval; 1775 return depth; 1776 } 1777 1778 1779 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1780 1781 1782 #define osIdIndex 0 1783 #define threadIdIndex 1 1784 #define coreIdIndex 2 1785 #define pkgIdIndex 3 1786 #define nodeIdIndex 4 1787 1788 typedef unsigned *ProcCpuInfo; 1789 static unsigned maxIndex = pkgIdIndex; 1790 1791 1792 static int 1793 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1794 { 1795 const unsigned *aa = (const unsigned *)a; 1796 const unsigned *bb = (const unsigned *)b; 1797 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1798 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1799 return 0; 1800 }; 1801 1802 1803 static int 1804 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1805 { 1806 unsigned i; 1807 const unsigned *aa = *((const unsigned **)a); 1808 const unsigned *bb = *((const unsigned **)b); 1809 for (i = maxIndex; ; i--) { 1810 if (aa[i] < bb[i]) return -1; 1811 if (aa[i] > bb[i]) return 1; 1812 if (i == osIdIndex) break; 1813 } 1814 return 0; 1815 } 1816 1817 1818 // 1819 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1820 // affinity map. 1821 // 1822 static int 1823 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1824 kmp_i18n_id_t *const msg_id, FILE *f) 1825 { 1826 *address2os = NULL; 1827 *msg_id = kmp_i18n_null; 1828 1829 // 1830 // Scan of the file, and count the number of "processor" (osId) fields, 1831 // and find the highest value of <n> for a node_<n> field. 1832 // 1833 char buf[256]; 1834 unsigned num_records = 0; 1835 while (! feof(f)) { 1836 buf[sizeof(buf) - 1] = 1; 1837 if (! 
fgets(buf, sizeof(buf), f)) { 1838 // 1839 // Read errors presumably because of EOF 1840 // 1841 break; 1842 } 1843 1844 char s1[] = "processor"; 1845 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1846 num_records++; 1847 continue; 1848 } 1849 1850 // 1851 // FIXME - this will match "node_<n> <garbage>" 1852 // 1853 unsigned level; 1854 if (sscanf(buf, "node_%d id", &level) == 1) { 1855 if (nodeIdIndex + level >= maxIndex) { 1856 maxIndex = nodeIdIndex + level; 1857 } 1858 continue; 1859 } 1860 } 1861 1862 // 1863 // Check for empty file / no valid processor records, or too many. 1864 // The number of records can't exceed the number of valid bits in the 1865 // affinity mask. 1866 // 1867 if (num_records == 0) { 1868 *line = 0; 1869 *msg_id = kmp_i18n_str_NoProcRecords; 1870 return -1; 1871 } 1872 if (num_records > (unsigned)__kmp_xproc) { 1873 *line = 0; 1874 *msg_id = kmp_i18n_str_TooManyProcRecords; 1875 return -1; 1876 } 1877 1878 // 1879 // Set the file pointer back to the begginning, so that we can scan the 1880 // file again, this time performing a full parse of the data. 1881 // Allocate a vector of ProcCpuInfo object, where we will place the data. 1882 // Adding an extra element at the end allows us to remove a lot of extra 1883 // checks for termination conditions. 1884 // 1885 if (fseek(f, 0, SEEK_SET) != 0) { 1886 *line = 0; 1887 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1888 return -1; 1889 } 1890 1891 // 1892 // Allocate the array of records to store the proc info in. The dummy 1893 // element at the end makes the logic in filling them out easier to code. 1894 // 1895 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1896 * sizeof(unsigned *)); 1897 unsigned i; 1898 for (i = 0; i <= num_records; i++) { 1899 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1900 * sizeof(unsigned)); 1901 } 1902 1903 #define CLEANUP_THREAD_INFO \ 1904 for (i = 0; i <= num_records; i++) { \ 1905 __kmp_free(threadInfo[i]); \ 1906 } \ 1907 __kmp_free(threadInfo); 1908 1909 // 1910 // A value of UINT_MAX means that we didn't find the field 1911 // 1912 unsigned __index; 1913 1914 #define INIT_PROC_INFO(p) \ 1915 for (__index = 0; __index <= maxIndex; __index++) { \ 1916 (p)[__index] = UINT_MAX; \ 1917 } 1918 1919 for (i = 0; i <= num_records; i++) { 1920 INIT_PROC_INFO(threadInfo[i]); 1921 } 1922 1923 unsigned num_avail = 0; 1924 *line = 0; 1925 while (! feof(f)) { 1926 // 1927 // Create an inner scoping level, so that all the goto targets at the 1928 // end of the loop appear in an outer scoping level. This avoids 1929 // warnings about jumping past an initialization to a target in the 1930 // same block. 1931 // 1932 { 1933 buf[sizeof(buf) - 1] = 1; 1934 bool long_line = false; 1935 if (! fgets(buf, sizeof(buf), f)) { 1936 // 1937 // Read errors presumably because of EOF 1938 // 1939 // If there is valid data in threadInfo[num_avail], then fake 1940 // a blank line in ensure that the last address gets parsed. 1941 // 1942 bool valid = false; 1943 for (i = 0; i <= maxIndex; i++) { 1944 if (threadInfo[num_avail][i] != UINT_MAX) { 1945 valid = true; 1946 } 1947 } 1948 if (! valid) { 1949 break; 1950 } 1951 buf[0] = 0; 1952 } else if (!buf[sizeof(buf) - 1]) { 1953 // 1954 // The line is longer than the buffer. Set a flag and don't 1955 // emit an error if we were going to ignore the line, anyway. 
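// (The CHECK_LINE macro defined just below only turns this flag into a hard
// error if the over-long line begins with one of the field names we parse;
// unrecognized long lines are simply drained up to the next newline and
// skipped.)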
1956 // 1957 long_line = true; 1958 1959 #define CHECK_LINE \ 1960 if (long_line) { \ 1961 CLEANUP_THREAD_INFO; \ 1962 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1963 return -1; \ 1964 } 1965 } 1966 (*line)++; 1967 1968 char s1[] = "processor"; 1969 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1970 CHECK_LINE; 1971 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1972 unsigned val; 1973 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1974 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1975 threadInfo[num_avail][osIdIndex] = val; 1976 #if KMP_OS_LINUX && USE_SYSFS_INFO 1977 char path[256]; 1978 snprintf(path, sizeof(path), 1979 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1980 threadInfo[num_avail][osIdIndex]); 1981 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1982 1983 snprintf(path, sizeof(path), 1984 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1985 threadInfo[num_avail][osIdIndex]); 1986 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1987 continue; 1988 #else 1989 } 1990 char s2[] = "physical id"; 1991 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1992 CHECK_LINE; 1993 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1994 unsigned val; 1995 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1996 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 1997 threadInfo[num_avail][pkgIdIndex] = val; 1998 continue; 1999 } 2000 char s3[] = "core id"; 2001 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2002 CHECK_LINE; 2003 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2004 unsigned val; 2005 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2006 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2007 threadInfo[num_avail][coreIdIndex] = val; 2008 continue; 2009 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2010 } 2011 char s4[] = "thread id"; 2012 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2013 CHECK_LINE; 2014 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2015 unsigned val; 2016 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2017 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2018 threadInfo[num_avail][threadIdIndex] = val; 2019 continue; 2020 } 2021 unsigned level; 2022 if (sscanf(buf, "node_%d id", &level) == 1) { 2023 CHECK_LINE; 2024 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2025 unsigned val; 2026 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2027 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2028 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2029 threadInfo[num_avail][nodeIdIndex + level] = val; 2030 continue; 2031 } 2032 2033 // 2034 // We didn't recognize the leading token on the line. 2035 // There are lots of leading tokens that we don't recognize - 2036 // if the line isn't empty, go on to the next line. 2037 // 2038 if ((*buf != 0) && (*buf != '\n')) { 2039 // 2040 // If the line is longer than the buffer, read characters 2041 // until we find a newline. 2042 // 2043 if (long_line) { 2044 int ch; 2045 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2046 } 2047 continue; 2048 } 2049 2050 // 2051 // A newline has signalled the end of the processor record. 2052 // Check that there aren't too many procs specified. 2053 // 2054 if ((int)num_avail == __kmp_xproc) { 2055 CLEANUP_THREAD_INFO; 2056 *msg_id = kmp_i18n_str_TooManyEntries; 2057 return -1; 2058 } 2059 2060 // 2061 // Check for missing fields. 
The osId field must be there, and we 2062 // currently require that the physical id field is specified, also. 2063 // 2064 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2065 CLEANUP_THREAD_INFO; 2066 *msg_id = kmp_i18n_str_MissingProcField; 2067 return -1; 2068 } 2069 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2070 CLEANUP_THREAD_INFO; 2071 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2072 return -1; 2073 } 2074 2075 // 2076 // Skip this proc if it is not included in the machine model. 2077 // 2078 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2079 INIT_PROC_INFO(threadInfo[num_avail]); 2080 continue; 2081 } 2082 2083 // 2084 // We have a successful parse of this proc's info. 2085 // Increment the counter, and prepare for the next proc. 2086 // 2087 num_avail++; 2088 KMP_ASSERT(num_avail <= num_records); 2089 INIT_PROC_INFO(threadInfo[num_avail]); 2090 } 2091 continue; 2092 2093 no_val: 2094 CLEANUP_THREAD_INFO; 2095 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2096 return -1; 2097 2098 dup_field: 2099 CLEANUP_THREAD_INFO; 2100 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2101 return -1; 2102 } 2103 *line = 0; 2104 2105 # if KMP_MIC && REDUCE_TEAM_SIZE 2106 unsigned teamSize = 0; 2107 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2108 2109 // check for num_records == __kmp_xproc ??? 2110 2111 // 2112 // If there's only one thread context to bind to, form an Address object 2113 // with depth 1 and return immediately (or, if affinity is off, set 2114 // address2os to NULL and return). 2115 // 2116 // If it is configured to omit the package level when there is only a 2117 // single package, the logic at the end of this routine won't work if 2118 // there is only a single thread - it would try to form an Address 2119 // object with depth 0. 2120 // 2121 KMP_ASSERT(num_avail > 0); 2122 KMP_ASSERT(num_avail <= num_records); 2123 if (num_avail == 1) { 2124 __kmp_ncores = 1; 2125 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2126 if (__kmp_affinity_verbose) { 2127 if (! 
KMP_AFFINITY_CAPABLE()) { 2128 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2129 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2130 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2131 } 2132 else { 2133 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2134 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2135 fullMask); 2136 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2137 if (__kmp_affinity_respect_mask) { 2138 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2139 } else { 2140 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2141 } 2142 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2143 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2144 } 2145 int index; 2146 kmp_str_buf_t buf; 2147 __kmp_str_buf_init(&buf); 2148 __kmp_str_buf_print(&buf, "1"); 2149 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2150 __kmp_str_buf_print(&buf, " x 1"); 2151 } 2152 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2153 __kmp_str_buf_free(&buf); 2154 } 2155 2156 if (__kmp_affinity_type == affinity_none) { 2157 CLEANUP_THREAD_INFO; 2158 return 0; 2159 } 2160 2161 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2162 Address addr(1); 2163 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2164 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2165 2166 if (__kmp_affinity_gran_levels < 0) { 2167 __kmp_affinity_gran_levels = 0; 2168 } 2169 2170 if (__kmp_affinity_verbose) { 2171 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2172 } 2173 2174 CLEANUP_THREAD_INFO; 2175 return 1; 2176 } 2177 2178 // 2179 // Sort the threadInfo table by physical Id. 2180 // 2181 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2182 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2183 2184 // 2185 // The table is now sorted by pkgId / coreId / threadId, but we really 2186 // don't know the radix of any of the fields. pkgId's may be sparsely 2187 // assigned among the chips on a system. Although coreId's are usually 2188 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2189 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2190 // 2191 // For that matter, we don't know what coresPerPkg and threadsPerCore 2192 // (or the total # packages) are at this point - we want to determine 2193 // that now. We only have an upper bound on the first two figures. 2194 // 2195 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2196 * sizeof(unsigned)); 2197 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2198 * sizeof(unsigned)); 2199 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2200 * sizeof(unsigned)); 2201 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2202 * sizeof(unsigned)); 2203 2204 bool assign_thread_ids = false; 2205 unsigned threadIdCt; 2206 unsigned index; 2207 2208 restart_radix_check: 2209 threadIdCt = 0; 2210 2211 // 2212 // Initialize the counter arrays with data from threadInfo[0]. 2213 // 2214 if (assign_thread_ids) { 2215 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2216 threadInfo[0][threadIdIndex] = threadIdCt++; 2217 } 2218 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2219 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2220 } 2221 } 2222 for (index = 0; index <= maxIndex; index++) { 2223 counts[index] = 1; 2224 maxCt[index] = 1; 2225 totals[index] = 1; 2226 lastId[index] = threadInfo[0][index];; 2227 } 2228 2229 // 2230 // Run through the rest of the OS procs. 
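// For illustration (hypothetical 1-package, 2-core, 2-thread-per-core box,
// with the records already sorted by physical id): after this pass,
// maxCt[threadIdIndex] == 2, maxCt[coreIdIndex] == 2, totals[coreIdIndex] == 2
// and totals[pkgIdIndex] == 1, i.e. 2 threads/core, 2 cores/package and a
// single package.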
2231 // 2232 for (i = 1; i < num_avail; i++) { 2233 // 2234 // Find the most significant index whose id differs 2235 // from the id for the previous OS proc. 2236 // 2237 for (index = maxIndex; index >= threadIdIndex; index--) { 2238 if (assign_thread_ids && (index == threadIdIndex)) { 2239 // 2240 // Auto-assign the thread id field if it wasn't specified. 2241 // 2242 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2243 threadInfo[i][threadIdIndex] = threadIdCt++; 2244 } 2245 2246 // 2247 // Aparrently the thread id field was specified for some 2248 // entries and not others. Start the thread id counter 2249 // off at the next higher thread id. 2250 // 2251 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2252 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2253 } 2254 } 2255 if (threadInfo[i][index] != lastId[index]) { 2256 // 2257 // Run through all indices which are less significant, 2258 // and reset the counts to 1. 2259 // 2260 // At all levels up to and including index, we need to 2261 // increment the totals and record the last id. 2262 // 2263 unsigned index2; 2264 for (index2 = threadIdIndex; index2 < index; index2++) { 2265 totals[index2]++; 2266 if (counts[index2] > maxCt[index2]) { 2267 maxCt[index2] = counts[index2]; 2268 } 2269 counts[index2] = 1; 2270 lastId[index2] = threadInfo[i][index2]; 2271 } 2272 counts[index]++; 2273 totals[index]++; 2274 lastId[index] = threadInfo[i][index]; 2275 2276 if (assign_thread_ids && (index > threadIdIndex)) { 2277 2278 # if KMP_MIC && REDUCE_TEAM_SIZE 2279 // 2280 // The default team size is the total #threads in the machine 2281 // minus 1 thread for every core that has 3 or more threads. 2282 // 2283 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2284 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2285 2286 // 2287 // Restart the thread counter, as we are on a new core. 2288 // 2289 threadIdCt = 0; 2290 2291 // 2292 // Auto-assign the thread id field if it wasn't specified. 2293 // 2294 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2295 threadInfo[i][threadIdIndex] = threadIdCt++; 2296 } 2297 2298 // 2299 // Aparrently the thread id field was specified for some 2300 // entries and not others. Start the thread id counter 2301 // off at the next higher thread id. 2302 // 2303 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2304 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2305 } 2306 } 2307 break; 2308 } 2309 } 2310 if (index < threadIdIndex) { 2311 // 2312 // If thread ids were specified, it is an error if they are not 2313 // unique. Also, check that we waven't already restarted the 2314 // loop (to be safe - shouldn't need to). 2315 // 2316 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2317 || assign_thread_ids) { 2318 __kmp_free(lastId); 2319 __kmp_free(totals); 2320 __kmp_free(maxCt); 2321 __kmp_free(counts); 2322 CLEANUP_THREAD_INFO; 2323 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2324 return -1; 2325 } 2326 2327 // 2328 // If the thread ids were not specified and we see entries 2329 // entries that are duplicates, start the loop over and 2330 // assign the thread ids manually. 2331 // 2332 assign_thread_ids = true; 2333 goto restart_radix_check; 2334 } 2335 } 2336 2337 # if KMP_MIC && REDUCE_TEAM_SIZE 2338 // 2339 // The default team size is the total #threads in the machine 2340 // minus 1 thread for every core that has 3 or more threads. 2341 // 2342 teamSize += ( threadIdCt <= 2 ) ? 
( threadIdCt ) : ( threadIdCt - 1 ); 2343 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2344 2345 for (index = threadIdIndex; index <= maxIndex; index++) { 2346 if (counts[index] > maxCt[index]) { 2347 maxCt[index] = counts[index]; 2348 } 2349 } 2350 2351 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2352 nCoresPerPkg = maxCt[coreIdIndex]; 2353 nPackages = totals[pkgIdIndex]; 2354 2355 // 2356 // Check to see if the machine topology is uniform 2357 // 2358 unsigned prod = totals[maxIndex]; 2359 for (index = threadIdIndex; index < maxIndex; index++) { 2360 prod *= maxCt[index]; 2361 } 2362 bool uniform = (prod == totals[threadIdIndex]); 2363 2364 // 2365 // When affinity is off, this routine will still be called to set 2366 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2367 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2368 // correctly, and return now if affinity is not enabled. 2369 // 2370 __kmp_ncores = totals[coreIdIndex]; 2371 2372 if (__kmp_affinity_verbose) { 2373 if (! KMP_AFFINITY_CAPABLE()) { 2374 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2375 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2376 if (uniform) { 2377 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2378 } else { 2379 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2380 } 2381 } 2382 else { 2383 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2384 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2385 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2386 if (__kmp_affinity_respect_mask) { 2387 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2388 } else { 2389 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2390 } 2391 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2392 if (uniform) { 2393 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2394 } else { 2395 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2396 } 2397 } 2398 kmp_str_buf_t buf; 2399 __kmp_str_buf_init(&buf); 2400 2401 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2402 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2403 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2404 } 2405 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2406 maxCt[threadIdIndex], __kmp_ncores); 2407 2408 __kmp_str_buf_free(&buf); 2409 } 2410 2411 # if KMP_MIC && REDUCE_TEAM_SIZE 2412 // 2413 // Set the default team size. 2414 // 2415 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2416 __kmp_dflt_team_nth = teamSize; 2417 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2418 __kmp_dflt_team_nth)); 2419 } 2420 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2421 2422 if (__kmp_affinity_type == affinity_none) { 2423 __kmp_free(lastId); 2424 __kmp_free(totals); 2425 __kmp_free(maxCt); 2426 __kmp_free(counts); 2427 CLEANUP_THREAD_INFO; 2428 return 0; 2429 } 2430 2431 // 2432 // Count the number of levels which have more nodes at that level than 2433 // at the parent's level (with there being an implicit root node of 2434 // the top level). This is equivalent to saying that there is at least 2435 // one node at this level which has a sibling. These levels are in the 2436 // map, and the package level is always in the map. 
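// Continuing the hypothetical 2-thread/2-core/1-package example above:
// totals at the thread/core/package indices would be {4, 2, 1}, so inMap is
// true for the thread and core levels (4 > 2 and 2 > 1) and forced true for
// the package level, giving depth == 3.  Without SMT the thread and core
// totals are equal, and the thread level is left out of the map.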
2437 // 2438 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2439 int level = 0; 2440 for (index = threadIdIndex; index < maxIndex; index++) { 2441 KMP_ASSERT(totals[index] >= totals[index + 1]); 2442 inMap[index] = (totals[index] > totals[index + 1]); 2443 } 2444 inMap[maxIndex] = (totals[maxIndex] > 1); 2445 inMap[pkgIdIndex] = true; 2446 2447 int depth = 0; 2448 for (index = threadIdIndex; index <= maxIndex; index++) { 2449 if (inMap[index]) { 2450 depth++; 2451 } 2452 } 2453 KMP_ASSERT(depth > 0); 2454 2455 // 2456 // Construct the data structure that is to be returned. 2457 // 2458 *address2os = (AddrUnsPair*) 2459 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2460 int pkgLevel = -1; 2461 int coreLevel = -1; 2462 int threadLevel = -1; 2463 2464 for (i = 0; i < num_avail; ++i) { 2465 Address addr(depth); 2466 unsigned os = threadInfo[i][osIdIndex]; 2467 int src_index; 2468 int dst_index = 0; 2469 2470 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2471 if (! inMap[src_index]) { 2472 continue; 2473 } 2474 addr.labels[dst_index] = threadInfo[i][src_index]; 2475 if (src_index == pkgIdIndex) { 2476 pkgLevel = dst_index; 2477 } 2478 else if (src_index == coreIdIndex) { 2479 coreLevel = dst_index; 2480 } 2481 else if (src_index == threadIdIndex) { 2482 threadLevel = dst_index; 2483 } 2484 dst_index++; 2485 } 2486 (*address2os)[i] = AddrUnsPair(addr, os); 2487 } 2488 2489 if (__kmp_affinity_gran_levels < 0) { 2490 // 2491 // Set the granularity level based on what levels are modeled 2492 // in the machine topology map. 2493 // 2494 unsigned src_index; 2495 __kmp_affinity_gran_levels = 0; 2496 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2497 if (! inMap[src_index]) { 2498 continue; 2499 } 2500 switch (src_index) { 2501 case threadIdIndex: 2502 if (__kmp_affinity_gran > affinity_gran_thread) { 2503 __kmp_affinity_gran_levels++; 2504 } 2505 2506 break; 2507 case coreIdIndex: 2508 if (__kmp_affinity_gran > affinity_gran_core) { 2509 __kmp_affinity_gran_levels++; 2510 } 2511 break; 2512 2513 case pkgIdIndex: 2514 if (__kmp_affinity_gran > affinity_gran_package) { 2515 __kmp_affinity_gran_levels++; 2516 } 2517 break; 2518 } 2519 } 2520 } 2521 2522 if (__kmp_affinity_verbose) { 2523 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2524 coreLevel, threadLevel); 2525 } 2526 2527 __kmp_free(inMap); 2528 __kmp_free(lastId); 2529 __kmp_free(totals); 2530 __kmp_free(maxCt); 2531 __kmp_free(counts); 2532 CLEANUP_THREAD_INFO; 2533 return depth; 2534 } 2535 2536 2537 // 2538 // Create and return a table of affinity masks, indexed by OS thread ID. 2539 // This routine handles OR'ing together all the affinity masks of threads 2540 // that are sufficiently close, if granularity > fine. 2541 // 2542 static kmp_affin_mask_t * 2543 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2544 AddrUnsPair *address2os, unsigned numAddrs) 2545 { 2546 // 2547 // First form a table of affinity masks in order of OS thread id. 2548 // 2549 unsigned depth; 2550 unsigned maxOsId; 2551 unsigned i; 2552 2553 KMP_ASSERT(numAddrs > 0); 2554 depth = address2os[0].first.depth; 2555 2556 maxOsId = 0; 2557 for (i = 0; i < numAddrs; i++) { 2558 unsigned osId = address2os[i].second; 2559 if (osId > maxOsId) { 2560 maxOsId = osId; 2561 } 2562 } 2563 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2564 (maxOsId + 1) * __kmp_affin_mask_size); 2565 2566 // 2567 // Sort the address2os table according to physical order. 
Doing so 2568 // will put all threads on the same core/package/node in consecutive 2569 // locations. 2570 // 2571 qsort(address2os, numAddrs, sizeof(*address2os), 2572 __kmp_affinity_cmp_Address_labels); 2573 2574 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2575 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2576 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2577 } 2578 if (__kmp_affinity_gran_levels >= (int)depth) { 2579 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2580 && (__kmp_affinity_type != affinity_none))) { 2581 KMP_WARNING(AffThreadsMayMigrate); 2582 } 2583 } 2584 2585 // 2586 // Run through the table, forming the masks for all threads on each 2587 // core. Threads on the same core will have identical "Address" 2588 // objects, not considering the last level, which must be the thread 2589 // id. All threads on a core will appear consecutively. 2590 // 2591 unsigned unique = 0; 2592 unsigned j = 0; // index of 1st thread on core 2593 unsigned leader = 0; 2594 Address *leaderAddr = &(address2os[0].first); 2595 kmp_affin_mask_t *sum 2596 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size); 2597 KMP_CPU_ZERO(sum); 2598 KMP_CPU_SET(address2os[0].second, sum); 2599 for (i = 1; i < numAddrs; i++) { 2600 // 2601 // If this thread is sufficiently close to the leader (within the 2602 // granularity setting), then set the bit for this os thread in the 2603 // affinity mask for this group, and go on to the next thread. 2604 // 2605 if (leaderAddr->isClose(address2os[i].first, 2606 __kmp_affinity_gran_levels)) { 2607 KMP_CPU_SET(address2os[i].second, sum); 2608 continue; 2609 } 2610 2611 // 2612 // For every thread in this group, copy the mask to the thread's 2613 // entry in the osId2Mask table. Mark the first address as a 2614 // leader. 2615 // 2616 for (; j < i; j++) { 2617 unsigned osId = address2os[j].second; 2618 KMP_DEBUG_ASSERT(osId <= maxOsId); 2619 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2620 KMP_CPU_COPY(mask, sum); 2621 address2os[j].first.leader = (j == leader); 2622 } 2623 unique++; 2624 2625 // 2626 // Start a new mask. 2627 // 2628 leader = i; 2629 leaderAddr = &(address2os[i].first); 2630 KMP_CPU_ZERO(sum); 2631 KMP_CPU_SET(address2os[i].second, sum); 2632 } 2633 2634 // 2635 // For every thread in last group, copy the mask to the thread's 2636 // entry in the osId2Mask table. 2637 // 2638 for (; j < i; j++) { 2639 unsigned osId = address2os[j].second; 2640 KMP_DEBUG_ASSERT(osId <= maxOsId); 2641 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2642 KMP_CPU_COPY(mask, sum); 2643 address2os[j].first.leader = (j == leader); 2644 } 2645 unique++; 2646 2647 *maxIndex = maxOsId; 2648 *numUnique = unique; 2649 return osId2Mask; 2650 } 2651 2652 2653 // 2654 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2655 // as file-static than to try and pass them through the calling sequence of 2656 // the recursive-descent OMP_PLACES parser. 2657 // 2658 static kmp_affin_mask_t *newMasks; 2659 static int numNewMasks; 2660 static int nextNewMask; 2661 2662 #define ADD_MASK(_mask) \ 2663 { \ 2664 if (nextNewMask >= numNewMasks) { \ 2665 numNewMasks *= 2; \ 2666 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2667 numNewMasks * __kmp_affin_mask_size); \ 2668 } \ 2669 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2670 nextNewMask++; \ 2671 } 2672 2673 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2674 { \ 2675 if (((_osId) > _maxOsId) || \ 2676 (! 
KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2677 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2678 && (__kmp_affinity_type != affinity_none))) { \ 2679 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2680 } \ 2681 } \ 2682 else { \ 2683 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2684 } \ 2685 } 2686 2687 2688 // 2689 // Re-parse the proclist (for the explicit affinity type), and form the list 2690 // of affinity newMasks indexed by gtid. 2691 // 2692 static void 2693 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2694 unsigned int *out_numMasks, const char *proclist, 2695 kmp_affin_mask_t *osId2Mask, int maxOsId) 2696 { 2697 const char *scan = proclist; 2698 const char *next = proclist; 2699 2700 // 2701 // We use malloc() for the temporary mask vector, 2702 // so that we can use realloc() to extend it. 2703 // 2704 numNewMasks = 2; 2705 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2706 * __kmp_affin_mask_size); 2707 nextNewMask = 0; 2708 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2709 __kmp_affin_mask_size); 2710 int setSize = 0; 2711 2712 for (;;) { 2713 int start, end, stride; 2714 2715 SKIP_WS(scan); 2716 next = scan; 2717 if (*next == '\0') { 2718 break; 2719 } 2720 2721 if (*next == '{') { 2722 int num; 2723 setSize = 0; 2724 next++; // skip '{' 2725 SKIP_WS(next); 2726 scan = next; 2727 2728 // 2729 // Read the first integer in the set. 2730 // 2731 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2732 "bad proclist"); 2733 SKIP_DIGITS(next); 2734 num = __kmp_str_to_int(scan, *next); 2735 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2736 2737 // 2738 // Copy the mask for that osId to the sum (union) mask. 2739 // 2740 if ((num > maxOsId) || 2741 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2742 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2743 && (__kmp_affinity_type != affinity_none))) { 2744 KMP_WARNING(AffIgnoreInvalidProcID, num); 2745 } 2746 KMP_CPU_ZERO(sumMask); 2747 } 2748 else { 2749 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2750 setSize = 1; 2751 } 2752 2753 for (;;) { 2754 // 2755 // Check for end of set. 2756 // 2757 SKIP_WS(next); 2758 if (*next == '}') { 2759 next++; // skip '}' 2760 break; 2761 } 2762 2763 // 2764 // Skip optional comma. 2765 // 2766 if (*next == ',') { 2767 next++; 2768 } 2769 SKIP_WS(next); 2770 2771 // 2772 // Read the next integer in the set. 2773 // 2774 scan = next; 2775 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2776 "bad explicit proc list"); 2777 2778 SKIP_DIGITS(next); 2779 num = __kmp_str_to_int(scan, *next); 2780 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2781 2782 // 2783 // Add the mask for that osId to the sum mask. 2784 // 2785 if ((num > maxOsId) || 2786 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2787 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2788 && (__kmp_affinity_type != affinity_none))) { 2789 KMP_WARNING(AffIgnoreInvalidProcID, num); 2790 } 2791 } 2792 else { 2793 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2794 setSize++; 2795 } 2796 } 2797 if (setSize > 0) { 2798 ADD_MASK(sumMask); 2799 } 2800 2801 SKIP_WS(next); 2802 if (*next == ',') { 2803 next++; 2804 } 2805 scan = next; 2806 continue; 2807 } 2808 2809 // 2810 // Read the first integer. 
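// For example (hypothetical proclist): given "3,0-6:2,{8,9}", this path sees
// the bare "3" and the range "0-6:2" (start 0, end 6, stride 2, i.e. procs
// 0, 2, 4, 6), each surviving proc id contributing one mask via
// ADD_MASK_OSID; the "{8,9}" set was already folded into a single combined
// mask by the braces branch above.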
2811 // 2812 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2813 SKIP_DIGITS(next); 2814 start = __kmp_str_to_int(scan, *next); 2815 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2816 SKIP_WS(next); 2817 2818 // 2819 // If this isn't a range, then add a mask to the list and go on. 2820 // 2821 if (*next != '-') { 2822 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2823 2824 // 2825 // Skip optional comma. 2826 // 2827 if (*next == ',') { 2828 next++; 2829 } 2830 scan = next; 2831 continue; 2832 } 2833 2834 // 2835 // This is a range. Skip over the '-' and read in the 2nd int. 2836 // 2837 next++; // skip '-' 2838 SKIP_WS(next); 2839 scan = next; 2840 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2841 SKIP_DIGITS(next); 2842 end = __kmp_str_to_int(scan, *next); 2843 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2844 2845 // 2846 // Check for a stride parameter 2847 // 2848 stride = 1; 2849 SKIP_WS(next); 2850 if (*next == ':') { 2851 // 2852 // A stride is specified. Skip over the ':" and read the 3rd int. 2853 // 2854 int sign = +1; 2855 next++; // skip ':' 2856 SKIP_WS(next); 2857 scan = next; 2858 if (*next == '-') { 2859 sign = -1; 2860 next++; 2861 SKIP_WS(next); 2862 scan = next; 2863 } 2864 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2865 "bad explicit proc list"); 2866 SKIP_DIGITS(next); 2867 stride = __kmp_str_to_int(scan, *next); 2868 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2869 stride *= sign; 2870 } 2871 2872 // 2873 // Do some range checks. 2874 // 2875 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2876 if (stride > 0) { 2877 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2878 } 2879 else { 2880 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2881 } 2882 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2883 2884 // 2885 // Add the mask for each OS proc # to the list. 2886 // 2887 if (stride > 0) { 2888 do { 2889 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2890 start += stride; 2891 } while (start <= end); 2892 } 2893 else { 2894 do { 2895 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2896 start += stride; 2897 } while (start >= end); 2898 } 2899 2900 // 2901 // Skip optional comma. 2902 // 2903 SKIP_WS(next); 2904 if (*next == ',') { 2905 next++; 2906 } 2907 scan = next; 2908 } 2909 2910 *out_numMasks = nextNewMask; 2911 if (nextNewMask == 0) { 2912 *out_masks = NULL; 2913 KMP_INTERNAL_FREE(newMasks); 2914 return; 2915 } 2916 *out_masks 2917 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2918 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2919 __kmp_free(sumMask); 2920 KMP_INTERNAL_FREE(newMasks); 2921 } 2922 2923 2924 # if OMP_40_ENABLED 2925 2926 /*----------------------------------------------------------------------------- 2927 2928 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2929 places. Again, Here is the grammar: 2930 2931 place_list := place 2932 place_list := place , place_list 2933 place := num 2934 place := place : num 2935 place := place : num : signed 2936 place := { subplacelist } 2937 place := ! 
place // (lowest priority) 2938 subplace_list := subplace 2939 subplace_list := subplace , subplace_list 2940 subplace := num 2941 subplace := num : num 2942 subplace := num : num : signed 2943 signed := num 2944 signed := + signed 2945 signed := - signed 2946 2947 -----------------------------------------------------------------------------*/ 2948 2949 static void 2950 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2951 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2952 { 2953 const char *next; 2954 2955 for (;;) { 2956 int start, count, stride, i; 2957 2958 // 2959 // Read in the starting proc id 2960 // 2961 SKIP_WS(*scan); 2962 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2963 "bad explicit places list"); 2964 next = *scan; 2965 SKIP_DIGITS(next); 2966 start = __kmp_str_to_int(*scan, *next); 2967 KMP_ASSERT(start >= 0); 2968 *scan = next; 2969 2970 // 2971 // valid follow sets are ',' ':' and '}' 2972 // 2973 SKIP_WS(*scan); 2974 if (**scan == '}' || **scan == ',') { 2975 if ((start > maxOsId) || 2976 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2977 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2978 && (__kmp_affinity_type != affinity_none))) { 2979 KMP_WARNING(AffIgnoreInvalidProcID, start); 2980 } 2981 } 2982 else { 2983 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2984 (*setSize)++; 2985 } 2986 if (**scan == '}') { 2987 break; 2988 } 2989 (*scan)++; // skip ',' 2990 continue; 2991 } 2992 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2993 (*scan)++; // skip ':' 2994 2995 // 2996 // Read count parameter 2997 // 2998 SKIP_WS(*scan); 2999 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3000 "bad explicit places list"); 3001 next = *scan; 3002 SKIP_DIGITS(next); 3003 count = __kmp_str_to_int(*scan, *next); 3004 KMP_ASSERT(count >= 0); 3005 *scan = next; 3006 3007 // 3008 // valid follow sets are ',' ':' and '}' 3009 // 3010 SKIP_WS(*scan); 3011 if (**scan == '}' || **scan == ',') { 3012 for (i = 0; i < count; i++) { 3013 if ((start > maxOsId) || 3014 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3015 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3016 && (__kmp_affinity_type != affinity_none))) { 3017 KMP_WARNING(AffIgnoreInvalidProcID, start); 3018 } 3019 break; // don't proliferate warnings for large count 3020 } 3021 else { 3022 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3023 start++; 3024 (*setSize)++; 3025 } 3026 } 3027 if (**scan == '}') { 3028 break; 3029 } 3030 (*scan)++; // skip ',' 3031 continue; 3032 } 3033 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3034 (*scan)++; // skip ':' 3035 3036 // 3037 // Read stride parameter 3038 // 3039 int sign = +1; 3040 for (;;) { 3041 SKIP_WS(*scan); 3042 if (**scan == '+') { 3043 (*scan)++; // skip '+' 3044 continue; 3045 } 3046 if (**scan == '-') { 3047 sign *= -1; 3048 (*scan)++; // skip '-' 3049 continue; 3050 } 3051 break; 3052 } 3053 SKIP_WS(*scan); 3054 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3055 "bad explicit places list"); 3056 next = *scan; 3057 SKIP_DIGITS(next); 3058 stride = __kmp_str_to_int(*scan, *next); 3059 KMP_ASSERT(stride >= 0); 3060 *scan = next; 3061 stride *= sign; 3062 3063 // 3064 // valid follow sets are ',' and '}' 3065 // 3066 SKIP_WS(*scan); 3067 if (**scan == '}' || **scan == ',') { 3068 for (i = 0; i < count; i++) { 3069 if ((start > maxOsId) || 3070 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3071 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3072 && (__kmp_affinity_type != affinity_none))) { 3073 KMP_WARNING(AffIgnoreInvalidProcID, start); 3074 } 3075 break; // don't proliferate warnings for large count 3076 } 3077 else { 3078 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3079 start += stride; 3080 (*setSize)++; 3081 } 3082 } 3083 if (**scan == '}') { 3084 break; 3085 } 3086 (*scan)++; // skip ',' 3087 continue; 3088 } 3089 3090 KMP_ASSERT2(0, "bad explicit places list"); 3091 } 3092 } 3093 3094 3095 static void 3096 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3097 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3098 { 3099 const char *next; 3100 3101 // 3102 // valid follow sets are '{' '!' and num 3103 // 3104 SKIP_WS(*scan); 3105 if (**scan == '{') { 3106 (*scan)++; // skip '{' 3107 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3108 setSize); 3109 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3110 (*scan)++; // skip '}' 3111 } 3112 else if (**scan == '!') { 3113 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3114 KMP_CPU_COMPLEMENT(tempMask); 3115 (*scan)++; // skip '!' 3116 } 3117 else if ((**scan >= '0') && (**scan <= '9')) { 3118 next = *scan; 3119 SKIP_DIGITS(next); 3120 int num = __kmp_str_to_int(*scan, *next); 3121 KMP_ASSERT(num >= 0); 3122 if ((num > maxOsId) || 3123 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3124 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3125 && (__kmp_affinity_type != affinity_none))) { 3126 KMP_WARNING(AffIgnoreInvalidProcID, num); 3127 } 3128 } 3129 else { 3130 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3131 (*setSize)++; 3132 } 3133 *scan = next; // skip num 3134 } 3135 else { 3136 KMP_ASSERT2(0, "bad explicit places list"); 3137 } 3138 } 3139 3140 3141 //static void 3142 void 3143 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3144 unsigned int *out_numMasks, const char *placelist, 3145 kmp_affin_mask_t *osId2Mask, int maxOsId) 3146 { 3147 const char *scan = placelist; 3148 const char *next = placelist; 3149 3150 numNewMasks = 2; 3151 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3152 * __kmp_affin_mask_size); 3153 nextNewMask = 0; 3154 3155 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3156 __kmp_affin_mask_size); 3157 KMP_CPU_ZERO(tempMask); 3158 int setSize = 0; 3159 3160 for (;;) { 3161 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3162 3163 // 3164 // valid follow sets are ',' ':' and EOL 3165 // 3166 SKIP_WS(scan); 3167 if (*scan == '\0' || *scan == ',') { 3168 if (setSize > 0) { 3169 ADD_MASK(tempMask); 3170 } 3171 KMP_CPU_ZERO(tempMask); 3172 setSize = 0; 3173 if (*scan == '\0') { 3174 break; 3175 } 3176 scan++; // skip ',' 3177 continue; 3178 } 3179 3180 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3181 scan++; // skip ':' 3182 3183 // 3184 // Read count parameter 3185 // 3186 SKIP_WS(scan); 3187 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3188 "bad explicit places list"); 3189 next = scan; 3190 SKIP_DIGITS(next); 3191 int count = __kmp_str_to_int(scan, *next); 3192 KMP_ASSERT(count >= 0); 3193 scan = next; 3194 3195 // 3196 // valid follow sets are ',' ':' and EOL 3197 // 3198 SKIP_WS(scan); 3199 int stride; 3200 if (*scan == '\0' || *scan == ',') { 3201 stride = +1; 3202 } 3203 else { 3204 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3205 
scan++; // skip ':' 3206 3207 // 3208 // Read stride parameter 3209 // 3210 int sign = +1; 3211 for (;;) { 3212 SKIP_WS(scan); 3213 if (*scan == '+') { 3214 scan++; // skip '+' 3215 continue; 3216 } 3217 if (*scan == '-') { 3218 sign *= -1; 3219 scan++; // skip '-' 3220 continue; 3221 } 3222 break; 3223 } 3224 SKIP_WS(scan); 3225 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3226 "bad explicit places list"); 3227 next = scan; 3228 SKIP_DIGITS(next); 3229 stride = __kmp_str_to_int(scan, *next); 3230 KMP_DEBUG_ASSERT(stride >= 0); 3231 scan = next; 3232 stride *= sign; 3233 } 3234 3235 if (stride > 0) { 3236 int i; 3237 for (i = 0; i < count; i++) { 3238 int j; 3239 if (setSize == 0) { 3240 break; 3241 } 3242 ADD_MASK(tempMask); 3243 setSize = 0; 3244 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3245 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3246 KMP_CPU_CLR(j, tempMask); 3247 } 3248 else if ((j > maxOsId) || 3249 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3250 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3251 && (__kmp_affinity_type != affinity_none))) { 3252 KMP_WARNING(AffIgnoreInvalidProcID, j); 3253 } 3254 KMP_CPU_CLR(j, tempMask); 3255 } 3256 else { 3257 KMP_CPU_SET(j, tempMask); 3258 setSize++; 3259 } 3260 } 3261 for (; j >= 0; j--) { 3262 KMP_CPU_CLR(j, tempMask); 3263 } 3264 } 3265 } 3266 else { 3267 int i; 3268 for (i = 0; i < count; i++) { 3269 int j; 3270 if (setSize == 0) { 3271 break; 3272 } 3273 ADD_MASK(tempMask); 3274 setSize = 0; 3275 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 3276 j++) { 3277 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3278 KMP_CPU_CLR(j, tempMask); 3279 } 3280 else if ((j > maxOsId) || 3281 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3282 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3283 && (__kmp_affinity_type != affinity_none))) { 3284 KMP_WARNING(AffIgnoreInvalidProcID, j); 3285 } 3286 KMP_CPU_CLR(j, tempMask); 3287 } 3288 else { 3289 KMP_CPU_SET(j, tempMask); 3290 setSize++; 3291 } 3292 } 3293 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3294 KMP_CPU_CLR(j, tempMask); 3295 } 3296 } 3297 } 3298 KMP_CPU_ZERO(tempMask); 3299 setSize = 0; 3300 3301 // 3302 // valid follow sets are ',' and EOL 3303 // 3304 SKIP_WS(scan); 3305 if (*scan == '\0') { 3306 break; 3307 } 3308 if (*scan == ',') { 3309 scan++; // skip ',' 3310 continue; 3311 } 3312 3313 KMP_ASSERT2(0, "bad explicit places list"); 3314 } 3315 3316 *out_numMasks = nextNewMask; 3317 if (nextNewMask == 0) { 3318 *out_masks = NULL; 3319 KMP_INTERNAL_FREE(newMasks); 3320 return; 3321 } 3322 *out_masks 3323 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3324 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3325 __kmp_free(tempMask); 3326 KMP_INTERNAL_FREE(newMasks); 3327 } 3328 3329 # endif /* OMP_40_ENABLED */ 3330 3331 #undef ADD_MASK 3332 #undef ADD_MASK_OSID 3333 3334 static void 3335 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3336 { 3337 if ( __kmp_place_num_cores == 0 ) { 3338 if ( __kmp_place_num_threads_per_core == 0 ) { 3339 return; // no cores limiting actions requested, exit 3340 } 3341 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3342 } 3343 if ( !__kmp_affinity_uniform_topology() ) { 3344 KMP_WARNING( AffThrPlaceNonUniform ); 3345 return; // don't support non-uniform topology 3346 } 3347 if ( depth != 3 ) { 3348 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3349 return; // don't support not-3-level topology 3350 } 3351 
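//
// For illustration (assuming these counts come from the KMP_PLACE_THREADS
// setting): a request equivalent to "2 cores per package, 1 thread per core"
// on a uniform 3-level map trims the topology below to
// nPackages * 2 * 1 entries and adjusts nCoresPerPkg, __kmp_nThreadsPerCore,
// __kmp_avail_proc and __kmp_ncores to match.
//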
if ( __kmp_place_num_threads_per_core == 0 ) { 3352 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3353 } 3354 if ( __kmp_place_core_offset + __kmp_place_num_cores > (unsigned int)nCoresPerPkg ) { 3355 KMP_WARNING( AffThrPlaceManyCores ); 3356 return; 3357 } 3358 3359 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3360 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3361 int i, j, k, n_old = 0, n_new = 0; 3362 for ( i = 0; i < nPackages; ++i ) { 3363 for ( j = 0; j < nCoresPerPkg; ++j ) { 3364 if ( (unsigned int)j < __kmp_place_core_offset || (unsigned int)j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3365 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3366 } else { 3367 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3368 if ( (unsigned int)k < __kmp_place_num_threads_per_core ) { 3369 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location 3370 n_new++; 3371 } 3372 n_old++; 3373 } 3374 } 3375 } 3376 } 3377 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3378 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3379 __kmp_avail_proc = n_new; // correct avail_proc 3380 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3381 3382 __kmp_free( *pAddr ); 3383 *pAddr = newAddr; // replace old topology with new one 3384 } 3385 3386 3387 static AddrUnsPair *address2os = NULL; 3388 static int * procarr = NULL; 3389 static int __kmp_aff_depth = 0; 3390 3391 static void 3392 __kmp_aux_affinity_initialize(void) 3393 { 3394 if (__kmp_affinity_masks != NULL) { 3395 KMP_ASSERT(fullMask != NULL); 3396 return; 3397 } 3398 3399 // 3400 // Create the "full" mask - this defines all of the processors that we 3401 // consider to be in the machine model. If respect is set, then it is 3402 // the initialization thread's affinity mask. Otherwise, it is all 3403 // processors that we know about on the machine. 3404 // 3405 if (fullMask == NULL) { 3406 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3407 } 3408 if (KMP_AFFINITY_CAPABLE()) { 3409 if (__kmp_affinity_respect_mask) { 3410 __kmp_get_system_affinity(fullMask, TRUE); 3411 3412 // 3413 // Count the number of available processors. 3414 // 3415 unsigned i; 3416 __kmp_avail_proc = 0; 3417 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3418 if (! KMP_CPU_ISSET(i, fullMask)) { 3419 continue; 3420 } 3421 __kmp_avail_proc++; 3422 } 3423 if (__kmp_avail_proc > __kmp_xproc) { 3424 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3425 && (__kmp_affinity_type != affinity_none))) { 3426 KMP_WARNING(ErrorInitializeAffinity); 3427 } 3428 __kmp_affinity_type = affinity_none; 3429 __kmp_affin_mask_size = 0; 3430 return; 3431 } 3432 } 3433 else { 3434 __kmp_affinity_entire_machine_mask(fullMask); 3435 __kmp_avail_proc = __kmp_xproc; 3436 } 3437 } 3438 3439 int depth = -1; 3440 kmp_i18n_id_t msg_id = kmp_i18n_null; 3441 3442 // 3443 // For backward compatibility, setting KMP_CPUINFO_FILE => 3444 // KMP_TOPOLOGY_METHOD=cpuinfo 3445 // 3446 if ((__kmp_cpuinfo_file != NULL) && 3447 (__kmp_affinity_top_method == affinity_top_method_all)) { 3448 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3449 } 3450 3451 if (__kmp_affinity_top_method == affinity_top_method_all) { 3452 // 3453 // In the default code path, errors are not fatal - we just try using 3454 // another method. 
We only emit a warning message if affinity is on, 3455 // or the verbose flag is set, an the nowarnings flag was not set. 3456 // 3457 const char *file_name = NULL; 3458 int line = 0; 3459 3460 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3461 3462 if (__kmp_affinity_verbose) { 3463 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3464 } 3465 3466 file_name = NULL; 3467 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3468 if (depth == 0) { 3469 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3470 KMP_ASSERT(address2os == NULL); 3471 return; 3472 } 3473 3474 if (depth < 0) { 3475 if (__kmp_affinity_verbose) { 3476 if (msg_id != kmp_i18n_null) { 3477 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3478 KMP_I18N_STR(DecodingLegacyAPIC)); 3479 } 3480 else { 3481 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3482 } 3483 } 3484 3485 file_name = NULL; 3486 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3487 if (depth == 0) { 3488 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3489 KMP_ASSERT(address2os == NULL); 3490 return; 3491 } 3492 } 3493 3494 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3495 3496 # if KMP_OS_LINUX 3497 3498 if (depth < 0) { 3499 if (__kmp_affinity_verbose) { 3500 if (msg_id != kmp_i18n_null) { 3501 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3502 } 3503 else { 3504 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3505 } 3506 } 3507 3508 FILE *f = fopen("/proc/cpuinfo", "r"); 3509 if (f == NULL) { 3510 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3511 } 3512 else { 3513 file_name = "/proc/cpuinfo"; 3514 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3515 fclose(f); 3516 if (depth == 0) { 3517 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3518 KMP_ASSERT(address2os == NULL); 3519 return; 3520 } 3521 } 3522 } 3523 3524 # endif /* KMP_OS_LINUX */ 3525 3526 # if KMP_GROUP_AFFINITY 3527 3528 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3529 if (__kmp_affinity_verbose) { 3530 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3531 } 3532 3533 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3534 KMP_ASSERT(depth != 0); 3535 } 3536 3537 # endif /* KMP_GROUP_AFFINITY */ 3538 3539 if (depth < 0) { 3540 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3541 if (file_name == NULL) { 3542 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3543 } 3544 else if (line == 0) { 3545 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3546 } 3547 else { 3548 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3549 } 3550 } 3551 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3552 3553 file_name = ""; 3554 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3555 if (depth == 0) { 3556 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3557 KMP_ASSERT(address2os == NULL); 3558 return; 3559 } 3560 KMP_ASSERT(depth > 0); 3561 KMP_ASSERT(address2os != NULL); 3562 } 3563 } 3564 3565 // 3566 // If the user has specified that a paricular topology discovery method 3567 // is to be used, then we abort if that method fails. The exception is 3568 // group affinity, which might have been implicitly set. 
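// For example, KMP_TOPOLOGY_METHOD=cpuinfo (or KMP_CPUINFO_FILE, via the
// compatibility mapping above) selects the /proc/cpuinfo parser branch
// below, and a parse failure is then reported with KMP_FATAL instead of
// silently falling back to the flat OS-proc map.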
3569 // 3570 3571 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3572 3573 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3574 if (__kmp_affinity_verbose) { 3575 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3576 KMP_I18N_STR(Decodingx2APIC)); 3577 } 3578 3579 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3580 if (depth == 0) { 3581 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3582 KMP_ASSERT(address2os == NULL); 3583 return; 3584 } 3585 if (depth < 0) { 3586 KMP_ASSERT(msg_id != kmp_i18n_null); 3587 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3588 } 3589 } 3590 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3591 if (__kmp_affinity_verbose) { 3592 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3593 KMP_I18N_STR(DecodingLegacyAPIC)); 3594 } 3595 3596 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3597 if (depth == 0) { 3598 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3599 KMP_ASSERT(address2os == NULL); 3600 return; 3601 } 3602 if (depth < 0) { 3603 KMP_ASSERT(msg_id != kmp_i18n_null); 3604 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3605 } 3606 } 3607 3608 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3609 3610 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3611 const char *filename; 3612 if (__kmp_cpuinfo_file != NULL) { 3613 filename = __kmp_cpuinfo_file; 3614 } 3615 else { 3616 filename = "/proc/cpuinfo"; 3617 } 3618 3619 if (__kmp_affinity_verbose) { 3620 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3621 } 3622 3623 FILE *f = fopen(filename, "r"); 3624 if (f == NULL) { 3625 int code = errno; 3626 if (__kmp_cpuinfo_file != NULL) { 3627 __kmp_msg( 3628 kmp_ms_fatal, 3629 KMP_MSG(CantOpenFileForReading, filename), 3630 KMP_ERR(code), 3631 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3632 __kmp_msg_null 3633 ); 3634 } 3635 else { 3636 __kmp_msg( 3637 kmp_ms_fatal, 3638 KMP_MSG(CantOpenFileForReading, filename), 3639 KMP_ERR(code), 3640 __kmp_msg_null 3641 ); 3642 } 3643 } 3644 int line = 0; 3645 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3646 fclose(f); 3647 if (depth < 0) { 3648 KMP_ASSERT(msg_id != kmp_i18n_null); 3649 if (line > 0) { 3650 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3651 } 3652 else { 3653 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3654 } 3655 } 3656 if (__kmp_affinity_type == affinity_none) { 3657 KMP_ASSERT(depth == 0); 3658 KMP_ASSERT(address2os == NULL); 3659 return; 3660 } 3661 } 3662 3663 # if KMP_GROUP_AFFINITY 3664 3665 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3666 if (__kmp_affinity_verbose) { 3667 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3668 } 3669 3670 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3671 KMP_ASSERT(depth != 0); 3672 if (depth < 0) { 3673 KMP_ASSERT(msg_id != kmp_i18n_null); 3674 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3675 } 3676 } 3677 3678 # endif /* KMP_GROUP_AFFINITY */ 3679 3680 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3681 if (__kmp_affinity_verbose) { 3682 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3683 } 3684 3685 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3686 if (depth == 0) { 3687 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3688 KMP_ASSERT(address2os == NULL); 3689 return; 3690 } 3691 // should not fail 3692 KMP_ASSERT(depth > 0); 3693 KMP_ASSERT(address2os != NULL); 3694 } 3695 3696 if (address2os == NULL) { 3697 if 
(KMP_AFFINITY_CAPABLE() 3698 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3699 && (__kmp_affinity_type != affinity_none)))) { 3700 KMP_WARNING(ErrorInitializeAffinity); 3701 } 3702 __kmp_affinity_type = affinity_none; 3703 __kmp_affin_mask_size = 0; 3704 return; 3705 } 3706 3707 __kmp_apply_thread_places(&address2os, depth); 3708 3709 // 3710 // Create the table of masks, indexed by thread Id. 3711 // 3712 unsigned maxIndex; 3713 unsigned numUnique; 3714 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3715 address2os, __kmp_avail_proc); 3716 if (__kmp_affinity_gran_levels == 0) { 3717 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3718 } 3719 3720 // 3721 // Set the childNums vector in all Address objects. This must be done 3722 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3723 // which takes into account the setting of __kmp_affinity_compact. 3724 // 3725 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3726 3727 switch (__kmp_affinity_type) { 3728 3729 case affinity_explicit: 3730 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3731 # if OMP_40_ENABLED 3732 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3733 # endif 3734 { 3735 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3736 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3737 maxIndex); 3738 } 3739 # if OMP_40_ENABLED 3740 else { 3741 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3742 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3743 maxIndex); 3744 } 3745 # endif 3746 if (__kmp_affinity_num_masks == 0) { 3747 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3748 && (__kmp_affinity_type != affinity_none))) { 3749 KMP_WARNING(AffNoValidProcID); 3750 } 3751 __kmp_affinity_type = affinity_none; 3752 return; 3753 } 3754 break; 3755 3756 // 3757 // The other affinity types rely on sorting the Addresses according 3758 // to some permutation of the machine topology tree. Set 3759 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3760 // then jump to a common code fragment to do the sort and create 3761 // the array of affinity masks. 
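// Roughly: a small __kmp_affinity_compact value keeps the outer (package)
// level most significant in the sort, so consecutive gtids end up close
// together, while affinity_scatter inverts the value (depth - 1 - compact)
// so consecutive gtids are spread across packages first; affinity_logical
// and affinity_physical are expressed as compact orderings combined with an
// offset into the resulting mask table.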
3762 // 3763 3764 case affinity_logical: 3765 __kmp_affinity_compact = 0; 3766 if (__kmp_affinity_offset) { 3767 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3768 % __kmp_avail_proc; 3769 } 3770 goto sortAddresses; 3771 3772 case affinity_physical: 3773 if (__kmp_nThreadsPerCore > 1) { 3774 __kmp_affinity_compact = 1; 3775 if (__kmp_affinity_compact >= depth) { 3776 __kmp_affinity_compact = 0; 3777 } 3778 } else { 3779 __kmp_affinity_compact = 0; 3780 } 3781 if (__kmp_affinity_offset) { 3782 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3783 % __kmp_avail_proc; 3784 } 3785 goto sortAddresses; 3786 3787 case affinity_scatter: 3788 if (__kmp_affinity_compact >= depth) { 3789 __kmp_affinity_compact = 0; 3790 } 3791 else { 3792 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3793 } 3794 goto sortAddresses; 3795 3796 case affinity_compact: 3797 if (__kmp_affinity_compact >= depth) { 3798 __kmp_affinity_compact = depth - 1; 3799 } 3800 goto sortAddresses; 3801 3802 case affinity_balanced: 3803 // Balanced works only for the case of a single package 3804 if( nPackages > 1 ) { 3805 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3806 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3807 } 3808 __kmp_affinity_type = affinity_none; 3809 return; 3810 } else if( __kmp_affinity_uniform_topology() ) { 3811 break; 3812 } else { // Non-uniform topology 3813 3814 // Save the depth for further usage 3815 __kmp_aff_depth = depth; 3816 3817 // Number of hyper threads per core in HT machine 3818 int nth_per_core = __kmp_nThreadsPerCore; 3819 3820 int core_level; 3821 if( nth_per_core > 1 ) { 3822 core_level = depth - 2; 3823 } else { 3824 core_level = depth - 1; 3825 } 3826 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3827 int nproc = nth_per_core * ncores; 3828 3829 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3830 for( int i = 0; i < nproc; i++ ) { 3831 procarr[ i ] = -1; 3832 } 3833 3834 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3835 int proc = address2os[ i ].second; 3836 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3837 // If there is only one thread per core then depth == 2: level 0 - package, 3838 // level 1 - core. 3839 int level = depth - 1; 3840 3841 // __kmp_nth_per_core == 1 3842 int thread = 0; 3843 int core = address2os[ i ].first.labels[ level ]; 3844 // If the thread level exists, that is we have more than one thread context per core 3845 if( nth_per_core > 1 ) { 3846 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3847 core = address2os[ i ].first.labels[ level - 1 ]; 3848 } 3849 procarr[ core * nth_per_core + thread ] = proc; 3850 } 3851 3852 break; 3853 } 3854 3855 sortAddresses: 3856 // 3857 // Allocate the gtid->affinity mask table. 
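// For example (hypothetical): with granularity=core on a 2-way SMT machine
// the two hardware threads of each core share one identical mask, so
// numUnique is half of __kmp_avail_proc; with duplicates allowed every OS
// proc still gets its own (repeated) entry here, otherwise only the unique
// masks are kept and gtids wrap around the shorter table.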
3858 // 3859 if (__kmp_affinity_dups) { 3860 __kmp_affinity_num_masks = __kmp_avail_proc; 3861 } 3862 else { 3863 __kmp_affinity_num_masks = numUnique; 3864 } 3865 3866 # if OMP_40_ENABLED 3867 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3868 && ( __kmp_affinity_num_places > 0 ) 3869 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3870 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3871 } 3872 # endif 3873 3874 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3875 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3876 3877 // 3878 // Sort the address2os table according to the current setting of 3879 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3880 // 3881 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3882 __kmp_affinity_cmp_Address_child_num); 3883 { 3884 int i; 3885 unsigned j; 3886 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3887 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3888 continue; 3889 } 3890 unsigned osId = address2os[i].second; 3891 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3892 kmp_affin_mask_t *dest 3893 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3894 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3895 KMP_CPU_COPY(dest, src); 3896 if (++j >= __kmp_affinity_num_masks) { 3897 break; 3898 } 3899 } 3900 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3901 } 3902 break; 3903 3904 default: 3905 KMP_ASSERT2(0, "Unexpected affinity setting"); 3906 } 3907 3908 __kmp_free(osId2Mask); 3909 machine_hierarchy.init(address2os, __kmp_avail_proc); 3910 } 3911 3912 3913 void 3914 __kmp_affinity_initialize(void) 3915 { 3916 // 3917 // Much of the code above was written assumming that if a machine was not 3918 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3919 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3920 // 3921 // There are too many checks for __kmp_affinity_type == affinity_none 3922 // in this code. Instead of trying to change them all, check if 3923 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3924 // affinity_none, call the real initialization routine, then restore 3925 // __kmp_affinity_type to affinity_disabled. 3926 // 3927 int disabled = (__kmp_affinity_type == affinity_disabled); 3928 if (! KMP_AFFINITY_CAPABLE()) { 3929 KMP_ASSERT(disabled); 3930 } 3931 if (disabled) { 3932 __kmp_affinity_type = affinity_none; 3933 } 3934 __kmp_aux_affinity_initialize(); 3935 if (disabled) { 3936 __kmp_affinity_type = affinity_disabled; 3937 } 3938 } 3939 3940 3941 void 3942 __kmp_affinity_uninitialize(void) 3943 { 3944 if (__kmp_affinity_masks != NULL) { 3945 __kmp_free(__kmp_affinity_masks); 3946 __kmp_affinity_masks = NULL; 3947 } 3948 if (fullMask != NULL) { 3949 KMP_CPU_FREE(fullMask); 3950 fullMask = NULL; 3951 } 3952 __kmp_affinity_num_masks = 0; 3953 # if OMP_40_ENABLED 3954 __kmp_affinity_num_places = 0; 3955 # endif 3956 if (__kmp_affinity_proclist != NULL) { 3957 __kmp_free(__kmp_affinity_proclist); 3958 __kmp_affinity_proclist = NULL; 3959 } 3960 if( address2os != NULL ) { 3961 __kmp_free( address2os ); 3962 address2os = NULL; 3963 } 3964 if( procarr != NULL ) { 3965 __kmp_free( procarr ); 3966 procarr = NULL; 3967 } 3968 } 3969 3970 3971 void 3972 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3973 { 3974 if (! 

void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
          ) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

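//
// Illustrative example (not in the original source): with
// __kmp_affinity_num_masks == 4 and __kmp_affinity_offset == 1, the place
// chosen above for gtids 0..5 is (gtid + 1) % 4, i.e. places 1, 2, 3, 0, 1, 2.
//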

# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
4213 // 4214 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4215 # endif 4216 4217 return retval; 4218 } 4219 4220 4221 int 4222 __kmp_aux_get_affinity(void **mask) 4223 { 4224 int gtid; 4225 int retval; 4226 kmp_info_t *th; 4227 4228 if (! KMP_AFFINITY_CAPABLE()) { 4229 return -1; 4230 } 4231 4232 gtid = __kmp_entry_gtid(); 4233 th = __kmp_threads[gtid]; 4234 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4235 4236 KA_TRACE(1000, ;{ 4237 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4238 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4239 th->th.th_affin_mask); 4240 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf); 4241 }); 4242 4243 if (__kmp_env_consistency_check) { 4244 if ((mask == NULL) || (*mask == NULL)) { 4245 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4246 } 4247 } 4248 4249 # if !KMP_OS_WINDOWS 4250 4251 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4252 KA_TRACE(1000, ;{ 4253 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4254 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4255 (kmp_affin_mask_t *)(*mask)); 4256 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf); 4257 }); 4258 return retval; 4259 4260 # else 4261 4262 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4263 return 0; 4264 4265 # endif /* KMP_OS_WINDOWS */ 4266 4267 } 4268 4269 int 4270 __kmp_aux_set_affinity_mask_proc(int proc, void **mask) 4271 { 4272 int retval; 4273 4274 if (! KMP_AFFINITY_CAPABLE()) { 4275 return -1; 4276 } 4277 4278 KA_TRACE(1000, ;{ 4279 int gtid = __kmp_entry_gtid(); 4280 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4281 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4282 (kmp_affin_mask_t *)(*mask)); 4283 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4284 proc, gtid, buf); 4285 }); 4286 4287 if (__kmp_env_consistency_check) { 4288 if ((mask == NULL) || (*mask == NULL)) { 4289 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4290 } 4291 } 4292 4293 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4294 return -1; 4295 } 4296 if (! KMP_CPU_ISSET(proc, fullMask)) { 4297 return -2; 4298 } 4299 4300 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4301 return 0; 4302 } 4303 4304 4305 int 4306 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4307 { 4308 int retval; 4309 4310 if (! KMP_AFFINITY_CAPABLE()) { 4311 return -1; 4312 } 4313 4314 KA_TRACE(1000, ;{ 4315 int gtid = __kmp_entry_gtid(); 4316 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4317 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4318 (kmp_affin_mask_t *)(*mask)); 4319 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4320 proc, gtid, buf); 4321 }); 4322 4323 if (__kmp_env_consistency_check) { 4324 if ((mask == NULL) || (*mask == NULL)) { 4325 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4326 } 4327 } 4328 4329 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4330 return -1; 4331 } 4332 if (! KMP_CPU_ISSET(proc, fullMask)) { 4333 return -2; 4334 } 4335 4336 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4337 return 0; 4338 } 4339 4340 4341 int 4342 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4343 { 4344 int retval; 4345 4346 if (! 
int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to it - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

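        //
        // Illustrative note (not in the original source): core_level selects the
        // label that identifies the core, e.g. with labels {package, core, thread}
        // and SMT enabled, core_level == __kmp_aff_depth - 2.  If the last entry
        // of the sorted address2os has core label 7, then ncores == 8, even if
        // some of those cores contribute no available procs.
        //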
        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count found osIDs for the current core; there can be no more than nth_per_core of them;
                // since address2os is sorted we can break when cnt==nth_per_core
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors;
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

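            //
            // Illustrative example (not in the original source): for 3 cores with
            // 2, 1 and 2 available contexts (nth_per_core == 2), nproc_at_core is
            // {2, 1, 2}, ncores_with_x_procs is {0, 1, 2} and
            // ncores_with_x_to_max_procs is {3, 3, 2}.  The loop below hands out
            // one thread per available context per pass and only starts doubling
            // up (flag != 0) once every available context already has a thread.
            //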
            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#else
// affinity not supported

kmp_uint32 mac_skipPerLevel[7];
kmp_uint32 mac_depth;
kmp_uint8 mac_leaf_kids;
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    thr_bar->skip_per_level = mac_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED