/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, buf_len, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, buf_len, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, buf_len, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}
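
//
// Usage sketch for __kmp_affinity_print_mask() above (illustrative only;
// "mask" stands for any affinity mask the caller owns):
//
//     char buf[KMP_AFFIN_MASK_PRINT_LEN];
//     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
//
// buf then holds, e.g., "{0,1,2,3}"; "{<empty>}" for an empty set; or a
// truncated form such as "{0,1,2,...}" when the set does not fit in the
// buffer.
//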
//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions
// are more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */
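
//
// Illustrative example (hypothetical machine): with 2 packages, 2
// cores/package and 2 threads/core, the Address of the OS proc on
// package 1, core 0, thread 1 has depth == 3 and labels[] = { 1, 0, 1 },
// coarsest level first.  childNums[] carries the same information
// renumbered to ordinal child numbers by
// __kmp_affinity_assign_child_nums() below.
//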
static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
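
//
// Worked example for __kmp_affinity_cmp_Address_child_num() above
// (hypothetical childNums): with depth == 3 and __kmp_affinity_compact
// == 1, the first loop compares childNums[2] (the finest level) and the
// second loop then compares childNums[0], childNums[1].  So the tuples
// {0,0,0}, {0,0,1}, {1,0,0}, {1,0,1} sort as {0,0,0}, {1,0,0}, {0,0,1},
// {1,0,1}: the thread ordinal varies slowest, which spreads consecutive
// entries across packages.
//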
/** A structure for holding machine-specific hierarchy info to be computed
    once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket,
        packages/node, nodes/machine, etc.  We don't want to get specific
        with nomenclature. */
    static const kmp_uint32 maxLevels = 7;

    /** This is specifically the depth of the machine configuration
        hierarchy, in terms of the number of levels along the longest path
        from root to any leaf.  It corresponds to the number of entries in
        numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized,
                                     // 2=initialization in progress

    /** Level 0 corresponds to leaves.  numPerLevel[i] is the number of
        children the parent of a node at level i has.  For example, if we
        have a machine with 4 packages, 4 cores/package and 2 HT per core,
        then numPerLevel = {2, 4, 4, 1, 1}.  All empty levels are set to
        1. */
    kmp_uint32 numPerLevel[maxLevels];
    kmp_uint32 skipPerLevel[maxLevels];

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i = hier_depth - 1; i >= 0; --i) {
            int max = -1;
            for (int j = 0; j < num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max + 1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(1) {}

    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result == 1);

        /* Added explicit initialization of the depth here to prevent usage
           of dirty value observed when static library is re-initialized
           multiple times (e.g. when non-OpenMP thread repeatedly
           launches/joins thread that uses OpenMP). */
        depth = 1;
        for (kmp_uint32 i = 0; i < maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os),
                  __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs / 4;
            if (num_addrs % 4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i = maxLevels - 1; i >= 0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs / 4;
        if (branch < 4) branch = 4;
        for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d == 0 && numPerLevel[d] > 4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d + 1] == 1) depth++;
                numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
            }
            if (numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch < 4) branch = 4;
            }
        }

        for (kmp_uint32 i = 1; i < depth; ++i)
            skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i = depth; i < maxLevels; ++i)
            skipPerLevel[i] = 2 * skipPerLevel[i - 1];

        uninitialized = 0; // One writer
    }
};

static hierarchy_info machine_hierarchy;

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // The loop below adjusts the depth in the case of oversubscription
    while (nproc > machine_hierarchy.skipPerLevel[depth - 1] &&
           depth < machine_hierarchy.maxLevels - 1)
        depth++;

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
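
//
// Worked example for hierarchy_info::init() above (hypothetical machine):
// 4 packages, 4 cores/package, 2 threads/core gives
// numPerLevel = {2, 4, 4, 1, 1, 1, 1} and depth == 4, so
// skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1] yields
// {1, 2, 8, 32, ...}, i.e. the number of leaves spanned by one node at
// each level; the remaining entries are filled by the oversubscription
// doubling (64, 128, 256).
//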
//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the
// child_num vector of the address object.  This is done in case the labels
// used for the children at one node of the hierarchy differ from those
// used for another node at the same level.  Example: suppose the machine
// has 2 nodes with 2 packages each.  The first node contains packages 601
// and 602, and the second node contains packages 603 and 604.  If we try
// to sort the table for "scatter" affinity, the table will still be sorted
// 601, 602, 603, 604 because we are paying attention to the labels
// themselves, not the ordinal child numbers.  By using the child numbers
// in the sort, the result is {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
    // Release the scratch arrays (they were previously leaked).
    __kmp_free(lastLabel);
    __kmp_free(counts);
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save
// and restore the mask, and it could be needed later, so saving it is just
// an optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
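
//
// Example: 2 packages x 4 cores x 2 threads = 16, so the topology is
// reported as uniform only when __kmp_avail_proc is exactly 16.  If a
// core is offline (say __kmp_avail_proc == 14), the product test above
// fails and the topology is treated as non-uniform.
//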
//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might
    // still be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}
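
//
// Sketch (illustrative): on a machine with OS procs {0,1,2,3} in fullMask
// and no topology information, the flat map above produces four depth-1
// Addresses with labels[0] = 0, 1, 2, 3, i.e. every proc is modeled as
// its own package.
//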
# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group
            // topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while ((1 << r) < count)
        ++r;
    return r;
}
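
//
// Worked example: __kmp_cpuid_mask_width(6) returns 3, since
// 1<<2 == 4 < 6 <= 8 == 1<<3, i.e. 3 bits are needed to encode 6
// distinct ids.  Exact powers of two map to their exact width:
// __kmp_cpuid_mask_width(8) == 3.
//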
class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }
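
        //
        // Illustrative arithmetic for the fallback below: with
        // __kmp_xproc == 6 and nCoresPerPkg == 4,
        // nPackages = (6 + 4 - 1) / 4 == 2, i.e. the proc count is
        // rounded up to whole packages.
        //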
        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the
        // cpuid info, so if the machine is not affinity capable, we assume
        // that HT is off.  We have seen quite a few machines where
        // maxThreadsPerPkg is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips,
        //   which do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly
        //   identifying a machine as being 1 thread/core when it is really
        //   HT enabled (which results in blocktime being incorrectly set to
        //   a positive value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);
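
    //
    // The loop below uses a bind-and-query pattern: the current thread is
    // temporarily bound to each proc in fullMask in turn, cpuid is issued
    // while bound, and oldMask is restored once every proc has been
    // visited.
    //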
    //
    // Run through each of the available contexts, binding the current
    // thread to it, and obtaining the pertinent information using the
    // cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread
    //    context has a unique Apic Id, which is of the form
    //    pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is
    //    supported by the chip but has been disabled, the value of this
    //    field will be 2 (for a single core chip).  On other OS/chip
    //    combinations supporting Intel(R) Hyper-Threading Technology, the
    //    value of this field will be 1 when Intel(R) Hyper-Threading
    //    Technology is disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field
    //    in the Apic Id.  The comments in "cpucount.cpp" say that this
    //    value is an upper bound, but the IA-32 architecture manual says
    //    that it is exactly the number of cores per package, and I haven't
    //    seen any case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (!((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);
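
    //
    // Worked example of the decomposition above (hypothetical values):
    // with maxThreadsPerPkg == 8 and maxCoresPerPkg == 4, widthCT == 3,
    // widthC == 2 and widthT == 1.  An apicId of 45 (0b101101) then
    // yields:
    //
    //     pkgId    = 45 >> 3       = 5
    //     coreId   = (45 >> 1) & 3 = 2
    //     threadId = 45 & 1        = 1
    //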
    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);
    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values
    // returned by the cpuid instruction for any thread bound to a given
    // package had better return the same info for maxThreadsPerPkg and
    // maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;                         // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and
        // maxThreadsPerPkg fields agree between all the threads bound to a
        // given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
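
    //
    // Worked example of the counting loop above (hypothetical ids): for
    // the sorted (pkgId, coreId, threadId) sequence
    // (0,0,0) (0,0,1) (0,1,0) (0,1,1) (3,0,0) (3,0,1), it yields
    // pkgCt == 2 packages, nCoresPerPkg == 2, __kmp_nThreadsPerCore == 2
    // and nCores == 3; the trailing updates flush the counts for the last
    // package.
    //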
    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of
    // cores per package, and the number of threads per core, we can
    // construct the data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}
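
//
// Example of the granularity computation in
// __kmp_affinity_create_apicid_map() above: with depth == 3 (package,
// core, thread) and granularity "core", only the thread level is finer
// than the requested granularity, so __kmp_affinity_gran_levels becomes 1
// and each mask is widened by one level.  (This assumes the usual
// ordering thread < core < package in the granularity enum, which the
// comparisons in the code rely on.)
//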
//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology.  While we're at
    // it, get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.
    // We will try to get more accurate values later by explicitly counting
    // them, but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine
            // topology, but so far, the only machine we have seen which
            // does not exit this loop before iteration 32 has fubar x2APIC
            // settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;
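
    //
    // Worked example of the enumeration above (hypothetical leaf 11
    // output): level 0 reports kind 1 (SMT) with 2 logical processors,
    // level 1 reports kind 2 (core) with 16, and level 2 reports
    // ebx == 0.  Then depth == 3 and, after the inversion, pkgLevel == 0,
    // coreLevel == 1, threadLevel == 2, matching the coarsest-first label
    // order used below.
    //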
    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current
    // thread to it, and obtaining the pertinent information using the
    // cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);
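
    //
    // Worked example of the label extraction above (hypothetical shifts):
    // with depth == 3 and per-level shift widths of 1 (SMT) and 5 (core),
    // an x2APIC id of 0x47 (0b1000111) decomposes as
    //
    //     labels[2] (thread)  = 0x47 & 0x1         = 1
    //     labels[1] (core)    = (0x47 & 0x1f) >> 1 = 3
    //     labels[0] (package) = 0x47 >> 5          = 2
    //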
    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }
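
    //
    // Per-level bookkeeping for the loop below (illustrative summary):
    // totals[level] counts the distinct subtrees seen so far,
    // counts[level] counts the children of the current parent, and
    // maxCt[level] records the widest parent encountered.  E.g. if pkg0
    // has 4 cores and pkg1 has 2, maxCt at the core level should end up 4
    // while totals ends up 6.
    //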
    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology
                    // information when the max value for some level
                    // (maxCt[level]) is encountered earlier than some
                    // lesser value while going through the array.  For
                    // example, let pkg0 have 4 cores and pkg1 have 2
                    // cores.  Then maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }
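
    //
    // Example: on a machine where leaf 11 reports an SMT level but only
    // one thread per core, maxCt[threadLevel] == 1, so the thread level
    // is dropped here and new_depth falls from 3 to 2.  The package level
    // is kept even when there is a single package.
    //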
1717 // 1718 if (new_depth != depth) { 1719 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1720 sizeof(AddrUnsPair) * nApics); 1721 for (proc = 0; (int)proc < nApics; proc++) { 1722 Address addr(new_depth); 1723 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1724 } 1725 int new_level = 0; 1726 for (level = 0; level < depth; level++) { 1727 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1728 if (level == threadLevel) { 1729 threadLevel = -1; 1730 } 1731 else if ((threadLevel >= 0) && (level < threadLevel)) { 1732 threadLevel--; 1733 } 1734 if (level == coreLevel) { 1735 coreLevel = -1; 1736 } 1737 else if ((coreLevel >= 0) && (level < coreLevel)) { 1738 coreLevel--; 1739 } 1740 if (level < pkgLevel) { 1741 pkgLevel--; 1742 } 1743 continue; 1744 } 1745 for (proc = 0; (int)proc < nApics; proc++) { 1746 new_retval[proc].first.labels[new_level] 1747 = retval[proc].first.labels[level]; 1748 } 1749 new_level++; 1750 } 1751 1752 __kmp_free(retval); 1753 retval = new_retval; 1754 depth = new_depth; 1755 } 1756 1757 if (__kmp_affinity_gran_levels < 0) { 1758 // 1759 // Set the granularity level based on what levels are modeled 1760 // in the machine topology map. 1761 // 1762 __kmp_affinity_gran_levels = 0; 1763 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1764 __kmp_affinity_gran_levels++; 1765 } 1766 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1767 __kmp_affinity_gran_levels++; 1768 } 1769 if (__kmp_affinity_gran > affinity_gran_package) { 1770 __kmp_affinity_gran_levels++; 1771 } 1772 } 1773 1774 if (__kmp_affinity_verbose) { 1775 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1776 coreLevel, threadLevel); 1777 } 1778 1779 __kmp_free(last); 1780 __kmp_free(maxCt); 1781 __kmp_free(counts); 1782 __kmp_free(totals); 1783 KMP_CPU_FREE(oldMask); 1784 *address2os = retval; 1785 return depth; 1786 } 1787 1788 1789 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1790 1791 1792 #define osIdIndex 0 1793 #define threadIdIndex 1 1794 #define coreIdIndex 2 1795 #define pkgIdIndex 3 1796 #define nodeIdIndex 4 1797 1798 typedef unsigned *ProcCpuInfo; 1799 static unsigned maxIndex = pkgIdIndex; 1800 1801 1802 static int 1803 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1804 { 1805 const unsigned *aa = (const unsigned *)a; 1806 const unsigned *bb = (const unsigned *)b; 1807 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1808 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1809 return 0; 1810 }; 1811 1812 1813 static int 1814 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1815 { 1816 unsigned i; 1817 const unsigned *aa = *((const unsigned **)a); 1818 const unsigned *bb = *((const unsigned **)b); 1819 for (i = maxIndex; ; i--) { 1820 if (aa[i] < bb[i]) return -1; 1821 if (aa[i] > bb[i]) return 1; 1822 if (i == osIdIndex) break; 1823 } 1824 return 0; 1825 } 1826 1827 1828 // 1829 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1830 // affinity map. 1831 // 1832 static int 1833 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1834 kmp_i18n_id_t *const msg_id, FILE *f) 1835 { 1836 *address2os = NULL; 1837 *msg_id = kmp_i18n_null; 1838 1839 // 1840 // Scan of the file, and count the number of "processor" (osId) fields, 1841 // and find the highest value of <n> for a node_<n> field. 1842 // 1843 char buf[256]; 1844 unsigned num_records = 0; 1845 while (! feof(f)) { 1846 buf[sizeof(buf) - 1] = 1; 1847 if (! 
fgets(buf, sizeof(buf), f)) { 1848 // 1849 // Read errors presumably because of EOF 1850 // 1851 break; 1852 } 1853 1854 char s1[] = "processor"; 1855 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1856 num_records++; 1857 continue; 1858 } 1859 1860 // 1861 // FIXME - this will match "node_<n> <garbage>" 1862 // 1863 unsigned level; 1864 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1865 if (nodeIdIndex + level >= maxIndex) { 1866 maxIndex = nodeIdIndex + level; 1867 } 1868 continue; 1869 } 1870 } 1871 1872 // 1873 // Check for empty file / no valid processor records, or too many. 1874 // The number of records can't exceed the number of valid bits in the 1875 // affinity mask. 1876 // 1877 if (num_records == 0) { 1878 *line = 0; 1879 *msg_id = kmp_i18n_str_NoProcRecords; 1880 return -1; 1881 } 1882 if (num_records > (unsigned)__kmp_xproc) { 1883 *line = 0; 1884 *msg_id = kmp_i18n_str_TooManyProcRecords; 1885 return -1; 1886 } 1887 1888 // 1889 // Set the file pointer back to the beginning, so that we can scan the 1890 // file again, this time performing a full parse of the data. 1891 // Allocate a vector of ProcCpuInfo objects, where we will place the data. 1892 // Adding an extra element at the end allows us to remove a lot of extra 1893 // checks for termination conditions. 1894 // 1895 if (fseek(f, 0, SEEK_SET) != 0) { 1896 *line = 0; 1897 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1898 return -1; 1899 } 1900 1901 // 1902 // Allocate the array of records to store the proc info in. The dummy 1903 // element at the end makes the logic in filling them out easier to code. 1904 // 1905 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1906 * sizeof(unsigned *)); 1907 unsigned i; 1908 for (i = 0; i <= num_records; i++) { 1909 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1910 * sizeof(unsigned)); 1911 } 1912 1913 #define CLEANUP_THREAD_INFO \ 1914 for (i = 0; i <= num_records; i++) { \ 1915 __kmp_free(threadInfo[i]); \ 1916 } \ 1917 __kmp_free(threadInfo); 1918 1919 // 1920 // A value of UINT_MAX means that we didn't find the field 1921 // 1922 unsigned __index; 1923 1924 #define INIT_PROC_INFO(p) \ 1925 for (__index = 0; __index <= maxIndex; __index++) { \ 1926 (p)[__index] = UINT_MAX; \ 1927 } 1928 1929 for (i = 0; i <= num_records; i++) { 1930 INIT_PROC_INFO(threadInfo[i]); 1931 } 1932 1933 unsigned num_avail = 0; 1934 *line = 0; 1935 while (! feof(f)) { 1936 // 1937 // Create an inner scoping level, so that all the goto targets at the 1938 // end of the loop appear in an outer scoping level. This avoids 1939 // warnings about jumping past an initialization to a target in the 1940 // same block. 1941 // 1942 { 1943 buf[sizeof(buf) - 1] = 1; 1944 bool long_line = false; 1945 if (! fgets(buf, sizeof(buf), f)) { 1946 // 1947 // Read errors presumably because of EOF 1948 // 1949 // If there is valid data in threadInfo[num_avail], then fake 1950 // a blank line to ensure that the last address gets parsed. 1951 // 1952 bool valid = false; 1953 for (i = 0; i <= maxIndex; i++) { 1954 if (threadInfo[num_avail][i] != UINT_MAX) { 1955 valid = true; 1956 } 1957 } 1958 if (! valid) { 1959 break; 1960 } 1961 buf[0] = 0; 1962 } else if (!buf[sizeof(buf) - 1]) { 1963 // 1964 // The line is longer than the buffer. Set a flag and don't 1965 // emit an error if we were going to ignore the line, anyway.
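// (For reference: the sentinel byte stored into buf[sizeof(buf) - 1]
// before each fgets() call is only overwritten by the trailing '\0'
// when fgets() fills the whole buffer, so a cleared sentinel means the
// current line did not fit.)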
1966 // 1967 long_line = true; 1968 1969 #define CHECK_LINE \ 1970 if (long_line) { \ 1971 CLEANUP_THREAD_INFO; \ 1972 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1973 return -1; \ 1974 } 1975 } 1976 (*line)++; 1977 1978 char s1[] = "processor"; 1979 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1980 CHECK_LINE; 1981 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1982 unsigned val; 1983 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 1984 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1985 threadInfo[num_avail][osIdIndex] = val; 1986 #if KMP_OS_LINUX && USE_SYSFS_INFO 1987 char path[256]; 1988 KMP_SNPRINTF(path, sizeof(path), 1989 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1990 threadInfo[num_avail][osIdIndex]); 1991 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1992 1993 KMP_SNPRINTF(path, sizeof(path), 1994 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1995 threadInfo[num_avail][osIdIndex]); 1996 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1997 continue; 1998 #else 1999 } 2000 char s2[] = "physical id"; 2001 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2002 CHECK_LINE; 2003 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2004 unsigned val; 2005 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2006 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2007 threadInfo[num_avail][pkgIdIndex] = val; 2008 continue; 2009 } 2010 char s3[] = "core id"; 2011 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2012 CHECK_LINE; 2013 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2014 unsigned val; 2015 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2016 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2017 threadInfo[num_avail][coreIdIndex] = val; 2018 continue; 2019 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2020 } 2021 char s4[] = "thread id"; 2022 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2023 CHECK_LINE; 2024 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2025 unsigned val; 2026 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2027 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2028 threadInfo[num_avail][threadIdIndex] = val; 2029 continue; 2030 } 2031 unsigned level; 2032 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2033 CHECK_LINE; 2034 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2035 unsigned val; 2036 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2037 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2038 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2039 threadInfo[num_avail][nodeIdIndex + level] = val; 2040 continue; 2041 } 2042 2043 // 2044 // We didn't recognize the leading token on the line. 2045 // There are lots of leading tokens that we don't recognize - 2046 // if the line isn't empty, go on to the next line. 2047 // 2048 if ((*buf != 0) && (*buf != '\n')) { 2049 // 2050 // If the line is longer than the buffer, read characters 2051 // until we find a newline. 2052 // 2053 if (long_line) { 2054 int ch; 2055 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2056 } 2057 continue; 2058 } 2059 2060 // 2061 // A newline has signalled the end of the processor record. 2062 // Check that there aren't too many procs specified. 2063 // 2064 if ((int)num_avail == __kmp_xproc) { 2065 CLEANUP_THREAD_INFO; 2066 *msg_id = kmp_i18n_str_TooManyEntries; 2067 return -1; 2068 } 2069 2070 // 2071 // Check for missing fields. 
The osId field must be there, and we 2072 // currently require that the physical id field is specified, also. 2073 // 2074 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2075 CLEANUP_THREAD_INFO; 2076 *msg_id = kmp_i18n_str_MissingProcField; 2077 return -1; 2078 } 2079 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2080 CLEANUP_THREAD_INFO; 2081 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2082 return -1; 2083 } 2084 2085 // 2086 // Skip this proc if it is not included in the machine model. 2087 // 2088 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2089 INIT_PROC_INFO(threadInfo[num_avail]); 2090 continue; 2091 } 2092 2093 // 2094 // We have a successful parse of this proc's info. 2095 // Increment the counter, and prepare for the next proc. 2096 // 2097 num_avail++; 2098 KMP_ASSERT(num_avail <= num_records); 2099 INIT_PROC_INFO(threadInfo[num_avail]); 2100 } 2101 continue; 2102 2103 no_val: 2104 CLEANUP_THREAD_INFO; 2105 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2106 return -1; 2107 2108 dup_field: 2109 CLEANUP_THREAD_INFO; 2110 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2111 return -1; 2112 } 2113 *line = 0; 2114 2115 # if KMP_MIC && REDUCE_TEAM_SIZE 2116 unsigned teamSize = 0; 2117 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2118 2119 // check for num_records == __kmp_xproc ??? 2120 2121 // 2122 // If there's only one thread context to bind to, form an Address object 2123 // with depth 1 and return immediately (or, if affinity is off, set 2124 // address2os to NULL and return). 2125 // 2126 // If it is configured to omit the package level when there is only a 2127 // single package, the logic at the end of this routine won't work if 2128 // there is only a single thread - it would try to form an Address 2129 // object with depth 0. 2130 // 2131 KMP_ASSERT(num_avail > 0); 2132 KMP_ASSERT(num_avail <= num_records); 2133 if (num_avail == 1) { 2134 __kmp_ncores = 1; 2135 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2136 if (__kmp_affinity_verbose) { 2137 if (! 
KMP_AFFINITY_CAPABLE()) { 2138 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2139 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2140 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2141 } 2142 else { 2143 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2144 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2145 fullMask); 2146 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2147 if (__kmp_affinity_respect_mask) { 2148 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2149 } else { 2150 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2151 } 2152 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2153 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2154 } 2155 int index; 2156 kmp_str_buf_t buf; 2157 __kmp_str_buf_init(&buf); 2158 __kmp_str_buf_print(&buf, "1"); 2159 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2160 __kmp_str_buf_print(&buf, " x 1"); 2161 } 2162 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2163 __kmp_str_buf_free(&buf); 2164 } 2165 2166 if (__kmp_affinity_type == affinity_none) { 2167 CLEANUP_THREAD_INFO; 2168 return 0; 2169 } 2170 2171 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2172 Address addr(1); 2173 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2174 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2175 2176 if (__kmp_affinity_gran_levels < 0) { 2177 __kmp_affinity_gran_levels = 0; 2178 } 2179 2180 if (__kmp_affinity_verbose) { 2181 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2182 } 2183 2184 CLEANUP_THREAD_INFO; 2185 return 1; 2186 } 2187 2188 // 2189 // Sort the threadInfo table by physical Id. 2190 // 2191 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2192 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2193 2194 // 2195 // The table is now sorted by pkgId / coreId / threadId, but we really 2196 // don't know the radix of any of the fields. pkgId's may be sparsely 2197 // assigned among the chips on a system. Although coreId's are usually 2198 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2199 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2200 // 2201 // For that matter, we don't know what coresPerPkg and threadsPerCore 2202 // (or the total # packages) are at this point - we want to determine 2203 // that now. We only have an upper bound on the first two figures. 2204 // 2205 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2206 * sizeof(unsigned)); 2207 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2208 * sizeof(unsigned)); 2209 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2210 * sizeof(unsigned)); 2211 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2212 * sizeof(unsigned)); 2213 2214 bool assign_thread_ids = false; 2215 unsigned threadIdCt; 2216 unsigned index; 2217 2218 restart_radix_check: 2219 threadIdCt = 0; 2220 2221 // 2222 // Initialize the counter arrays with data from threadInfo[0]. 2223 // 2224 if (assign_thread_ids) { 2225 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2226 threadInfo[0][threadIdIndex] = threadIdCt++; 2227 } 2228 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2229 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2230 } 2231 } 2232 for (index = 0; index <= maxIndex; index++) { 2233 counts[index] = 1; 2234 maxCt[index] = 1; 2235 totals[index] = 1; 2236 lastId[index] = threadInfo[0][index]; 2237 } 2238 2239 // 2240 // Run through the rest of the OS procs.
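// For illustration, on a hypothetical machine with 2 packages, 2 cores
// per package, and 2 threads per core (8 sorted records, no node_<n>
// fields, so maxIndex == pkgIdIndex), the loop below ends with
// totals[threadIdIndex] == 8, totals[coreIdIndex] == 4,
// totals[pkgIdIndex] == 2, and maxCt[coreIdIndex] ==
// maxCt[threadIdIndex] == 2.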
2241 // 2242 for (i = 1; i < num_avail; i++) { 2243 // 2244 // Find the most significant index whose id differs 2245 // from the id for the previous OS proc. 2246 // 2247 for (index = maxIndex; index >= threadIdIndex; index--) { 2248 if (assign_thread_ids && (index == threadIdIndex)) { 2249 // 2250 // Auto-assign the thread id field if it wasn't specified. 2251 // 2252 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2253 threadInfo[i][threadIdIndex] = threadIdCt++; 2254 } 2255 2256 // 2257 // Apparently the thread id field was specified for some 2258 // entries and not others. Start the thread id counter 2259 // off at the next higher thread id. 2260 // 2261 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2262 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2263 } 2264 } 2265 if (threadInfo[i][index] != lastId[index]) { 2266 // 2267 // Run through all indices which are less significant, 2268 // and reset the counts to 1. 2269 // 2270 // At all levels up to and including index, we need to 2271 // increment the totals and record the last id. 2272 // 2273 unsigned index2; 2274 for (index2 = threadIdIndex; index2 < index; index2++) { 2275 totals[index2]++; 2276 if (counts[index2] > maxCt[index2]) { 2277 maxCt[index2] = counts[index2]; 2278 } 2279 counts[index2] = 1; 2280 lastId[index2] = threadInfo[i][index2]; 2281 } 2282 counts[index]++; 2283 totals[index]++; 2284 lastId[index] = threadInfo[i][index]; 2285 2286 if (assign_thread_ids && (index > threadIdIndex)) { 2287 2288 # if KMP_MIC && REDUCE_TEAM_SIZE 2289 // 2290 // The default team size is the total #threads in the machine 2291 // minus 1 thread for every core that has 3 or more threads. 2292 // 2293 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2294 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2295 2296 // 2297 // Restart the thread counter, as we are on a new core. 2298 // 2299 threadIdCt = 0; 2300 2301 // 2302 // Auto-assign the thread id field if it wasn't specified. 2303 // 2304 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2305 threadInfo[i][threadIdIndex] = threadIdCt++; 2306 } 2307 2308 // 2309 // Apparently the thread id field was specified for some 2310 // entries and not others. Start the thread id counter 2311 // off at the next higher thread id. 2312 // 2313 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2314 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2315 } 2316 } 2317 break; 2318 } 2319 } 2320 if (index < threadIdIndex) { 2321 // 2322 // If thread ids were specified, it is an error if they are not 2323 // unique. Also, check that we haven't already restarted the 2324 // loop (to be safe - shouldn't need to). 2325 // 2326 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2327 || assign_thread_ids) { 2328 __kmp_free(lastId); 2329 __kmp_free(totals); 2330 __kmp_free(maxCt); 2331 __kmp_free(counts); 2332 CLEANUP_THREAD_INFO; 2333 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2334 return -1; 2335 } 2336 2337 // 2338 // If the thread ids were not specified and we see entries 2339 // that are duplicates, start the loop over and 2340 // assign the thread ids manually. 2341 // 2342 assign_thread_ids = true; 2343 goto restart_radix_check; 2344 } 2345 } 2346 2347 # if KMP_MIC && REDUCE_TEAM_SIZE 2348 // 2349 // The default team size is the total #threads in the machine 2350 // minus 1 thread for every core that has 3 or more threads. 2351 // 2352 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2353 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2354 2355 for (index = threadIdIndex; index <= maxIndex; index++) { 2356 if (counts[index] > maxCt[index]) { 2357 maxCt[index] = counts[index]; 2358 } 2359 } 2360 2361 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2362 nCoresPerPkg = maxCt[coreIdIndex]; 2363 nPackages = totals[pkgIdIndex]; 2364 2365 // 2366 // Check to see if the machine topology is uniform 2367 // 2368 unsigned prod = totals[maxIndex]; 2369 for (index = threadIdIndex; index < maxIndex; index++) { 2370 prod *= maxCt[index]; 2371 } 2372 bool uniform = (prod == totals[threadIdIndex]); 2373 2374 // 2375 // When affinity is off, this routine will still be called to set 2376 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2377 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2378 // correctly, and return now if affinity is not enabled. 2379 // 2380 __kmp_ncores = totals[coreIdIndex]; 2381 2382 if (__kmp_affinity_verbose) { 2383 if (! KMP_AFFINITY_CAPABLE()) { 2384 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2385 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2386 if (uniform) { 2387 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2388 } else { 2389 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2390 } 2391 } 2392 else { 2393 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2394 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2395 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2396 if (__kmp_affinity_respect_mask) { 2397 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2398 } else { 2399 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2400 } 2401 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2402 if (uniform) { 2403 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2404 } else { 2405 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2406 } 2407 } 2408 kmp_str_buf_t buf; 2409 __kmp_str_buf_init(&buf); 2410 2411 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2412 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2413 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2414 } 2415 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2416 maxCt[threadIdIndex], __kmp_ncores); 2417 2418 __kmp_str_buf_free(&buf); 2419 } 2420 2421 # if KMP_MIC && REDUCE_TEAM_SIZE 2422 // 2423 // Set the default team size. 2424 // 2425 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2426 __kmp_dflt_team_nth = teamSize; 2427 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2428 __kmp_dflt_team_nth)); 2429 } 2430 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2431 2432 if (__kmp_affinity_type == affinity_none) { 2433 __kmp_free(lastId); 2434 __kmp_free(totals); 2435 __kmp_free(maxCt); 2436 __kmp_free(counts); 2437 CLEANUP_THREAD_INFO; 2438 return 0; 2439 } 2440 2441 // 2442 // Count the number of levels which have more nodes at that level than 2443 // at the parent's level (with there being an implicit root node of 2444 // the top level). This is equivalent to saying that there is at least 2445 // one node at this level which has a sibling. These levels are in the 2446 // map, and the package level is always in the map. 
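// In the illustrative 2 x 2 x 2 example above, totals[] falls from 8 at
// the thread level to 4 at the core level to 2 at the package level, so
// inMap[] below is true at every index and depth == 3. With only one
// thread per core, totals at the thread and core levels would be equal
// and the thread level would be omitted from the map.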
2447 // 2448 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2449 int level = 0; 2450 for (index = threadIdIndex; index < maxIndex; index++) { 2451 KMP_ASSERT(totals[index] >= totals[index + 1]); 2452 inMap[index] = (totals[index] > totals[index + 1]); 2453 } 2454 inMap[maxIndex] = (totals[maxIndex] > 1); 2455 inMap[pkgIdIndex] = true; 2456 2457 int depth = 0; 2458 for (index = threadIdIndex; index <= maxIndex; index++) { 2459 if (inMap[index]) { 2460 depth++; 2461 } 2462 } 2463 KMP_ASSERT(depth > 0); 2464 2465 // 2466 // Construct the data structure that is to be returned. 2467 // 2468 *address2os = (AddrUnsPair*) 2469 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2470 int pkgLevel = -1; 2471 int coreLevel = -1; 2472 int threadLevel = -1; 2473 2474 for (i = 0; i < num_avail; ++i) { 2475 Address addr(depth); 2476 unsigned os = threadInfo[i][osIdIndex]; 2477 int src_index; 2478 int dst_index = 0; 2479 2480 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2481 if (! inMap[src_index]) { 2482 continue; 2483 } 2484 addr.labels[dst_index] = threadInfo[i][src_index]; 2485 if (src_index == pkgIdIndex) { 2486 pkgLevel = dst_index; 2487 } 2488 else if (src_index == coreIdIndex) { 2489 coreLevel = dst_index; 2490 } 2491 else if (src_index == threadIdIndex) { 2492 threadLevel = dst_index; 2493 } 2494 dst_index++; 2495 } 2496 (*address2os)[i] = AddrUnsPair(addr, os); 2497 } 2498 2499 if (__kmp_affinity_gran_levels < 0) { 2500 // 2501 // Set the granularity level based on what levels are modeled 2502 // in the machine topology map. 2503 // 2504 unsigned src_index; 2505 __kmp_affinity_gran_levels = 0; 2506 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2507 if (! inMap[src_index]) { 2508 continue; 2509 } 2510 switch (src_index) { 2511 case threadIdIndex: 2512 if (__kmp_affinity_gran > affinity_gran_thread) { 2513 __kmp_affinity_gran_levels++; 2514 } 2515 2516 break; 2517 case coreIdIndex: 2518 if (__kmp_affinity_gran > affinity_gran_core) { 2519 __kmp_affinity_gran_levels++; 2520 } 2521 break; 2522 2523 case pkgIdIndex: 2524 if (__kmp_affinity_gran > affinity_gran_package) { 2525 __kmp_affinity_gran_levels++; 2526 } 2527 break; 2528 } 2529 } 2530 } 2531 2532 if (__kmp_affinity_verbose) { 2533 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2534 coreLevel, threadLevel); 2535 } 2536 2537 __kmp_free(inMap); 2538 __kmp_free(lastId); 2539 __kmp_free(totals); 2540 __kmp_free(maxCt); 2541 __kmp_free(counts); 2542 CLEANUP_THREAD_INFO; 2543 return depth; 2544 } 2545 2546 2547 // 2548 // Create and return a table of affinity masks, indexed by OS thread ID. 2549 // This routine handles OR'ing together all the affinity masks of threads 2550 // that are sufficiently close, if granularity > fine. 2551 // 2552 static kmp_affin_mask_t * 2553 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2554 AddrUnsPair *address2os, unsigned numAddrs) 2555 { 2556 // 2557 // First form a table of affinity masks in order of OS thread id. 2558 // 2559 unsigned depth; 2560 unsigned maxOsId; 2561 unsigned i; 2562 2563 KMP_ASSERT(numAddrs > 0); 2564 depth = address2os[0].first.depth; 2565 2566 maxOsId = 0; 2567 for (i = 0; i < numAddrs; i++) { 2568 unsigned osId = address2os[i].second; 2569 if (osId > maxOsId) { 2570 maxOsId = osId; 2571 } 2572 } 2573 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2574 (maxOsId + 1) * __kmp_affin_mask_size); 2575 2576 // 2577 // Sort the address2os table according to physical order. 
Doing so 2578 // will put all threads on the same core/package/node in consecutive 2579 // locations. 2580 // 2581 qsort(address2os, numAddrs, sizeof(*address2os), 2582 __kmp_affinity_cmp_Address_labels); 2583 2584 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2585 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2586 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2587 } 2588 if (__kmp_affinity_gran_levels >= (int)depth) { 2589 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2590 && (__kmp_affinity_type != affinity_none))) { 2591 KMP_WARNING(AffThreadsMayMigrate); 2592 } 2593 } 2594 2595 // 2596 // Run through the table, forming the masks for all threads on each 2597 // core. Threads on the same core will have identical "Address" 2598 // objects, not considering the last level, which must be the thread 2599 // id. All threads on a core will appear consecutively. 2600 // 2601 unsigned unique = 0; 2602 unsigned j = 0; // index of 1st thread on core 2603 unsigned leader = 0; 2604 Address *leaderAddr = &(address2os[0].first); 2605 kmp_affin_mask_t *sum 2606 = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 2607 KMP_CPU_ZERO(sum); 2608 KMP_CPU_SET(address2os[0].second, sum); 2609 for (i = 1; i < numAddrs; i++) { 2610 // 2611 // If this thread is sufficiently close to the leader (within the 2612 // granularity setting), then set the bit for this os thread in the 2613 // affinity mask for this group, and go on to the next thread. 2614 // 2615 if (leaderAddr->isClose(address2os[i].first, 2616 __kmp_affinity_gran_levels)) { 2617 KMP_CPU_SET(address2os[i].second, sum); 2618 continue; 2619 } 2620 2621 // 2622 // For every thread in this group, copy the mask to the thread's 2623 // entry in the osId2Mask table. Mark the first address as a 2624 // leader. 2625 // 2626 for (; j < i; j++) { 2627 unsigned osId = address2os[j].second; 2628 KMP_DEBUG_ASSERT(osId <= maxOsId); 2629 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2630 KMP_CPU_COPY(mask, sum); 2631 address2os[j].first.leader = (j == leader); 2632 } 2633 unique++; 2634 2635 // 2636 // Start a new mask. 2637 // 2638 leader = i; 2639 leaderAddr = &(address2os[i].first); 2640 KMP_CPU_ZERO(sum); 2641 KMP_CPU_SET(address2os[i].second, sum); 2642 } 2643 2644 // 2645 // For every thread in last group, copy the mask to the thread's 2646 // entry in the osId2Mask table. 2647 // 2648 for (; j < i; j++) { 2649 unsigned osId = address2os[j].second; 2650 KMP_DEBUG_ASSERT(osId <= maxOsId); 2651 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2652 KMP_CPU_COPY(mask, sum); 2653 address2os[j].first.leader = (j == leader); 2654 } 2655 unique++; 2656 2657 *maxIndex = maxOsId; 2658 *numUnique = unique; 2659 return osId2Mask; 2660 } 2661 2662 2663 // 2664 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2665 // as file-static than to try and pass them through the calling sequence of 2666 // the recursive-descent OMP_PLACES parser. 
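// For illustration, a hypothetical setting of
// KMP_AFFINITY=explicit,proclist=[3,0-2,{4,5},8-12:2] is parsed below
// into the masks {3}, {0}, {1}, {2}, {4,5}, {8}, {10}, {12} - one mask
// per single proc id or range element, and one combined mask per {}
// set.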
2667 // 2668 static kmp_affin_mask_t *newMasks; 2669 static int numNewMasks; 2670 static int nextNewMask; 2671 2672 #define ADD_MASK(_mask) \ 2673 { \ 2674 if (nextNewMask >= numNewMasks) { \ 2675 numNewMasks *= 2; \ 2676 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2677 numNewMasks * __kmp_affin_mask_size); \ 2678 } \ 2679 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2680 nextNewMask++; \ 2681 } 2682 2683 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2684 { \ 2685 if (((_osId) > _maxOsId) || \ 2686 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2687 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2688 && (__kmp_affinity_type != affinity_none))) { \ 2689 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2690 } \ 2691 } \ 2692 else { \ 2693 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2694 } \ 2695 } 2696 2697 2698 // 2699 // Re-parse the proclist (for the explicit affinity type), and form the list 2700 // of affinity newMasks indexed by gtid. 2701 // 2702 static void 2703 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2704 unsigned int *out_numMasks, const char *proclist, 2705 kmp_affin_mask_t *osId2Mask, int maxOsId) 2706 { 2707 const char *scan = proclist; 2708 const char *next = proclist; 2709 2710 // 2711 // We use malloc() for the temporary mask vector, 2712 // so that we can use realloc() to extend it. 2713 // 2714 numNewMasks = 2; 2715 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2716 * __kmp_affin_mask_size); 2717 nextNewMask = 0; 2718 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2719 __kmp_affin_mask_size); 2720 int setSize = 0; 2721 2722 for (;;) { 2723 int start, end, stride; 2724 2725 SKIP_WS(scan); 2726 next = scan; 2727 if (*next == '\0') { 2728 break; 2729 } 2730 2731 if (*next == '{') { 2732 int num; 2733 setSize = 0; 2734 next++; // skip '{' 2735 SKIP_WS(next); 2736 scan = next; 2737 2738 // 2739 // Read the first integer in the set. 2740 // 2741 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2742 "bad proclist"); 2743 SKIP_DIGITS(next); 2744 num = __kmp_str_to_int(scan, *next); 2745 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2746 2747 // 2748 // Copy the mask for that osId to the sum (union) mask. 2749 // 2750 if ((num > maxOsId) || 2751 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2752 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2753 && (__kmp_affinity_type != affinity_none))) { 2754 KMP_WARNING(AffIgnoreInvalidProcID, num); 2755 } 2756 KMP_CPU_ZERO(sumMask); 2757 } 2758 else { 2759 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2760 setSize = 1; 2761 } 2762 2763 for (;;) { 2764 // 2765 // Check for end of set. 2766 // 2767 SKIP_WS(next); 2768 if (*next == '}') { 2769 next++; // skip '}' 2770 break; 2771 } 2772 2773 // 2774 // Skip optional comma. 2775 // 2776 if (*next == ',') { 2777 next++; 2778 } 2779 SKIP_WS(next); 2780 2781 // 2782 // Read the next integer in the set. 2783 // 2784 scan = next; 2785 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2786 "bad explicit proc list"); 2787 2788 SKIP_DIGITS(next); 2789 num = __kmp_str_to_int(scan, *next); 2790 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2791 2792 // 2793 // Add the mask for that osId to the sum mask. 2794 // 2795 if ((num > maxOsId) || 2796 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2797 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2798 && (__kmp_affinity_type != affinity_none))) { 2799 KMP_WARNING(AffIgnoreInvalidProcID, num); 2800 } 2801 } 2802 else { 2803 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2804 setSize++; 2805 } 2806 } 2807 if (setSize > 0) { 2808 ADD_MASK(sumMask); 2809 } 2810 2811 SKIP_WS(next); 2812 if (*next == ',') { 2813 next++; 2814 } 2815 scan = next; 2816 continue; 2817 } 2818 2819 // 2820 // Read the first integer. 2821 // 2822 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2823 SKIP_DIGITS(next); 2824 start = __kmp_str_to_int(scan, *next); 2825 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2826 SKIP_WS(next); 2827 2828 // 2829 // If this isn't a range, then add a mask to the list and go on. 2830 // 2831 if (*next != '-') { 2832 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2833 2834 // 2835 // Skip optional comma. 2836 // 2837 if (*next == ',') { 2838 next++; 2839 } 2840 scan = next; 2841 continue; 2842 } 2843 2844 // 2845 // This is a range. Skip over the '-' and read in the 2nd int. 2846 // 2847 next++; // skip '-' 2848 SKIP_WS(next); 2849 scan = next; 2850 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2851 SKIP_DIGITS(next); 2852 end = __kmp_str_to_int(scan, *next); 2853 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2854 2855 // 2856 // Check for a stride parameter 2857 // 2858 stride = 1; 2859 SKIP_WS(next); 2860 if (*next == ':') { 2861 // 2862 // A stride is specified. Skip over the ':' and read the 3rd int. 2863 // 2864 int sign = +1; 2865 next++; // skip ':' 2866 SKIP_WS(next); 2867 scan = next; 2868 if (*next == '-') { 2869 sign = -1; 2870 next++; 2871 SKIP_WS(next); 2872 scan = next; 2873 } 2874 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2875 "bad explicit proc list"); 2876 SKIP_DIGITS(next); 2877 stride = __kmp_str_to_int(scan, *next); 2878 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2879 stride *= sign; 2880 } 2881 2882 // 2883 // Do some range checks. 2884 // 2885 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2886 if (stride > 0) { 2887 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2888 } 2889 else { 2890 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2891 } 2892 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2893 2894 // 2895 // Add the mask for each OS proc # to the list. 2896 // 2897 if (stride > 0) { 2898 do { 2899 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2900 start += stride; 2901 } while (start <= end); 2902 } 2903 else { 2904 do { 2905 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2906 start += stride; 2907 } while (start >= end); 2908 } 2909 2910 // 2911 // Skip optional comma. 2912 // 2913 SKIP_WS(next); 2914 if (*next == ',') { 2915 next++; 2916 } 2917 scan = next; 2918 } 2919 2920 *out_numMasks = nextNewMask; 2921 if (nextNewMask == 0) { 2922 *out_masks = NULL; 2923 KMP_INTERNAL_FREE(newMasks); 2924 return; 2925 } 2926 *out_masks 2927 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2928 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2929 __kmp_free(sumMask); 2930 KMP_INTERNAL_FREE(newMasks); 2931 } 2932 2933 2934 # if OMP_40_ENABLED 2935 2936 /*----------------------------------------------------------------------------- 2937 2938 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2939 places.
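For example, a hypothetical setting of OMP_PLACES="{0,1,2,3},{4,5,6,7}"
describes two places of four OS procs each, and OMP_PLACES="{0:4}:2:4"
describes the same two places using the <lower>:<count>[:<stride>]
forms.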
Again, here is the grammar: 2940 2941 place_list := place 2942 place_list := place , place_list 2943 place := num 2944 place := place : num 2945 place := place : num : signed 2946 place := { subplace_list } 2947 place := ! place // (lowest priority) 2948 subplace_list := subplace 2949 subplace_list := subplace , subplace_list 2950 subplace := num 2951 subplace := num : num 2952 subplace := num : num : signed 2953 signed := num 2954 signed := + signed 2955 signed := - signed 2956 2957 -----------------------------------------------------------------------------*/ 2958 2959 static void 2960 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2961 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2962 { 2963 const char *next; 2964 2965 for (;;) { 2966 int start, count, stride, i; 2967 2968 // 2969 // Read in the starting proc id 2970 // 2971 SKIP_WS(*scan); 2972 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2973 "bad explicit places list"); 2974 next = *scan; 2975 SKIP_DIGITS(next); 2976 start = __kmp_str_to_int(*scan, *next); 2977 KMP_ASSERT(start >= 0); 2978 *scan = next; 2979 2980 // 2981 // valid follow sets are ',' ':' and '}' 2982 // 2983 SKIP_WS(*scan); 2984 if (**scan == '}' || **scan == ',') { 2985 if ((start > maxOsId) || 2986 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2987 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2988 && (__kmp_affinity_type != affinity_none))) { 2989 KMP_WARNING(AffIgnoreInvalidProcID, start); 2990 } 2991 } 2992 else { 2993 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2994 (*setSize)++; 2995 } 2996 if (**scan == '}') { 2997 break; 2998 } 2999 (*scan)++; // skip ',' 3000 continue; 3001 } 3002 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3003 (*scan)++; // skip ':' 3004 3005 // 3006 // Read count parameter 3007 // 3008 SKIP_WS(*scan); 3009 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3010 "bad explicit places list"); 3011 next = *scan; 3012 SKIP_DIGITS(next); 3013 count = __kmp_str_to_int(*scan, *next); 3014 KMP_ASSERT(count >= 0); 3015 *scan = next; 3016 3017 // 3018 // valid follow sets are ',' ':' and '}' 3019 // 3020 SKIP_WS(*scan); 3021 if (**scan == '}' || **scan == ',') { 3022 for (i = 0; i < count; i++) { 3023 if ((start > maxOsId) || 3024 (!
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3025 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3026 && (__kmp_affinity_type != affinity_none))) { 3027 KMP_WARNING(AffIgnoreInvalidProcID, start); 3028 } 3029 break; // don't proliferate warnings for large count 3030 } 3031 else { 3032 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3033 start++; 3034 (*setSize)++; 3035 } 3036 } 3037 if (**scan == '}') { 3038 break; 3039 } 3040 (*scan)++; // skip ',' 3041 continue; 3042 } 3043 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3044 (*scan)++; // skip ':' 3045 3046 // 3047 // Read stride parameter 3048 // 3049 int sign = +1; 3050 for (;;) { 3051 SKIP_WS(*scan); 3052 if (**scan == '+') { 3053 (*scan)++; // skip '+' 3054 continue; 3055 } 3056 if (**scan == '-') { 3057 sign *= -1; 3058 (*scan)++; // skip '-' 3059 continue; 3060 } 3061 break; 3062 } 3063 SKIP_WS(*scan); 3064 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3065 "bad explicit places list"); 3066 next = *scan; 3067 SKIP_DIGITS(next); 3068 stride = __kmp_str_to_int(*scan, *next); 3069 KMP_ASSERT(stride >= 0); 3070 *scan = next; 3071 stride *= sign; 3072 3073 // 3074 // valid follow sets are ',' and '}' 3075 // 3076 SKIP_WS(*scan); 3077 if (**scan == '}' || **scan == ',') { 3078 for (i = 0; i < count; i++) { 3079 if ((start > maxOsId) || 3080 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3081 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3082 && (__kmp_affinity_type != affinity_none))) { 3083 KMP_WARNING(AffIgnoreInvalidProcID, start); 3084 } 3085 break; // don't proliferate warnings for large count 3086 } 3087 else { 3088 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3089 start += stride; 3090 (*setSize)++; 3091 } 3092 } 3093 if (**scan == '}') { 3094 break; 3095 } 3096 (*scan)++; // skip ',' 3097 continue; 3098 } 3099 3100 KMP_ASSERT2(0, "bad explicit places list"); 3101 } 3102 } 3103 3104 3105 static void 3106 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3107 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3108 { 3109 const char *next; 3110 3111 // 3112 // valid follow sets are '{' '!' and num 3113 // 3114 SKIP_WS(*scan); 3115 if (**scan == '{') { 3116 (*scan)++; // skip '{' 3117 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3118 setSize); 3119 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3120 (*scan)++; // skip '}' 3121 } 3122 else if (**scan == '!') { 3123 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3124 KMP_CPU_COMPLEMENT(tempMask); 3125 (*scan)++; // skip '!' 3126 } 3127 else if ((**scan >= '0') && (**scan <= '9')) { 3128 next = *scan; 3129 SKIP_DIGITS(next); 3130 int num = __kmp_str_to_int(*scan, *next); 3131 KMP_ASSERT(num >= 0); 3132 if ((num > maxOsId) || 3133 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3134 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3135 && (__kmp_affinity_type != affinity_none))) { 3136 KMP_WARNING(AffIgnoreInvalidProcID, num); 3137 } 3138 } 3139 else { 3140 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3141 (*setSize)++; 3142 } 3143 *scan = next; // skip num 3144 } 3145 else { 3146 KMP_ASSERT2(0, "bad explicit places list"); 3147 } 3148 } 3149 3150 3151 //static void 3152 void 3153 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3154 unsigned int *out_numMasks, const char *placelist, 3155 kmp_affin_mask_t *osId2Mask, int maxOsId) 3156 { 3157 const char *scan = placelist; 3158 const char *next = placelist; 3159 3160 numNewMasks = 2; 3161 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3162 * __kmp_affin_mask_size); 3163 nextNewMask = 0; 3164 3165 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3166 __kmp_affin_mask_size); 3167 KMP_CPU_ZERO(tempMask); 3168 int setSize = 0; 3169 3170 for (;;) { 3171 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3172 3173 // 3174 // valid follow sets are ',' ':' and EOL 3175 // 3176 SKIP_WS(scan); 3177 if (*scan == '\0' || *scan == ',') { 3178 if (setSize > 0) { 3179 ADD_MASK(tempMask); 3180 } 3181 KMP_CPU_ZERO(tempMask); 3182 setSize = 0; 3183 if (*scan == '\0') { 3184 break; 3185 } 3186 scan++; // skip ',' 3187 continue; 3188 } 3189 3190 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3191 scan++; // skip ':' 3192 3193 // 3194 // Read count parameter 3195 // 3196 SKIP_WS(scan); 3197 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3198 "bad explicit places list"); 3199 next = scan; 3200 SKIP_DIGITS(next); 3201 int count = __kmp_str_to_int(scan, *next); 3202 KMP_ASSERT(count >= 0); 3203 scan = next; 3204 3205 // 3206 // valid follow sets are ',' ':' and EOL 3207 // 3208 SKIP_WS(scan); 3209 int stride; 3210 if (*scan == '\0' || *scan == ',') { 3211 stride = +1; 3212 } 3213 else { 3214 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3215 scan++; // skip ':' 3216 3217 // 3218 // Read stride parameter 3219 // 3220 int sign = +1; 3221 for (;;) { 3222 SKIP_WS(scan); 3223 if (*scan == '+') { 3224 scan++; // skip '+' 3225 continue; 3226 } 3227 if (*scan == '-') { 3228 sign *= -1; 3229 scan++; // skip '-' 3230 continue; 3231 } 3232 break; 3233 } 3234 SKIP_WS(scan); 3235 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3236 "bad explicit places list"); 3237 next = scan; 3238 SKIP_DIGITS(next); 3239 stride = __kmp_str_to_int(scan, *next); 3240 KMP_DEBUG_ASSERT(stride >= 0); 3241 scan = next; 3242 stride *= sign; 3243 } 3244 3245 if (stride > 0) { 3246 int i; 3247 for (i = 0; i < count; i++) { 3248 int j; 3249 if (setSize == 0) { 3250 break; 3251 } 3252 ADD_MASK(tempMask); 3253 setSize = 0; 3254 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3255 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3256 KMP_CPU_CLR(j, tempMask); 3257 } 3258 else if ((j > maxOsId) || 3259 (! 
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3260 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3261 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3262 KMP_WARNING(AffIgnoreInvalidProcID, j); 3263 } 3264 KMP_CPU_CLR(j, tempMask); 3265 } 3266 else { 3267 KMP_CPU_SET(j, tempMask); 3268 setSize++; 3269 } 3270 } 3271 for (; j >= 0; j--) { 3272 KMP_CPU_CLR(j, tempMask); 3273 } 3274 } 3275 } 3276 else { 3277 int i; 3278 for (i = 0; i < count; i++) { 3279 int j; 3280 if (setSize == 0) { 3281 break; 3282 } 3283 ADD_MASK(tempMask); 3284 setSize = 0; 3285 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 3286 j++) { 3287 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3288 KMP_CPU_CLR(j, tempMask); 3289 } 3290 else if ((j > maxOsId) || 3291 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3292 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3293 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3294 KMP_WARNING(AffIgnoreInvalidProcID, j); 3295 } 3296 KMP_CPU_CLR(j, tempMask); 3297 } 3298 else { 3299 KMP_CPU_SET(j, tempMask); 3300 setSize++; 3301 } 3302 } 3303 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3304 KMP_CPU_CLR(j, tempMask); 3305 } 3306 } 3307 } 3308 KMP_CPU_ZERO(tempMask); 3309 setSize = 0; 3310 3311 // 3312 // valid follow sets are ',' and EOL 3313 // 3314 SKIP_WS(scan); 3315 if (*scan == '\0') { 3316 break; 3317 } 3318 if (*scan == ',') { 3319 scan++; // skip ',' 3320 continue; 3321 } 3322 3323 KMP_ASSERT2(0, "bad explicit places list"); 3324 } 3325 3326 *out_numMasks = nextNewMask; 3327 if (nextNewMask == 0) { 3328 *out_masks = NULL; 3329 KMP_INTERNAL_FREE(newMasks); 3330 return; 3331 } 3332 *out_masks 3333 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3334 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3335 __kmp_free(tempMask); 3336 KMP_INTERNAL_FREE(newMasks); 3337 } 3338 3339 # endif /* OMP_40_ENABLED */ 3340 3341 #undef ADD_MASK 3342 #undef ADD_MASK_OSID 3343 3344 static void 3345 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3346 { 3347 if ( __kmp_place_num_cores == 0 ) { 3348 if ( __kmp_place_num_threads_per_core == 0 ) { 3349 return; // no cores limiting actions requested, exit 3350 } 3351 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3352 } 3353 if ( !__kmp_affinity_uniform_topology() ) { 3354 KMP_WARNING( AffThrPlaceNonUniform ); 3355 return; // don't support non-uniform topology 3356 } 3357 if ( depth != 3 ) { 3358 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3359 return; // don't support not-3-level topology 3360 } 3361 if ( __kmp_place_num_threads_per_core == 0 ) { 3362 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3363 } 3364 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3365 KMP_WARNING( AffThrPlaceManyCores ); 3366 return; 3367 } 3368 3369 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3370 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3371 int i, j, k, n_old = 0, n_new = 0; 3372 for ( i = 0; i < nPackages; ++i ) { 3373 for ( j = 0; j < nCoresPerPkg; ++j ) { 3374 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3375 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3376 } else { 3377 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3378 if ( k < __kmp_place_num_threads_per_core ) { 3379 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' 
data to new location 3380 n_new++; 3381 } 3382 n_old++; 3383 } 3384 } 3385 } 3386 } 3387 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3388 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3389 __kmp_avail_proc = n_new; // correct avail_proc 3390 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3391 3392 __kmp_free( *pAddr ); 3393 *pAddr = newAddr; // replace old topology with new one 3394 } 3395 3396 3397 static AddrUnsPair *address2os = NULL; 3398 static int * procarr = NULL; 3399 static int __kmp_aff_depth = 0; 3400 3401 static void 3402 __kmp_aux_affinity_initialize(void) 3403 { 3404 if (__kmp_affinity_masks != NULL) { 3405 KMP_ASSERT(fullMask != NULL); 3406 return; 3407 } 3408 3409 // 3410 // Create the "full" mask - this defines all of the processors that we 3411 // consider to be in the machine model. If respect is set, then it is 3412 // the initialization thread's affinity mask. Otherwise, it is all 3413 // processors that we know about on the machine. 3414 // 3415 if (fullMask == NULL) { 3416 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3417 } 3418 if (KMP_AFFINITY_CAPABLE()) { 3419 if (__kmp_affinity_respect_mask) { 3420 __kmp_get_system_affinity(fullMask, TRUE); 3421 3422 // 3423 // Count the number of available processors. 3424 // 3425 unsigned i; 3426 __kmp_avail_proc = 0; 3427 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3428 if (! KMP_CPU_ISSET(i, fullMask)) { 3429 continue; 3430 } 3431 __kmp_avail_proc++; 3432 } 3433 if (__kmp_avail_proc > __kmp_xproc) { 3434 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3435 && (__kmp_affinity_type != affinity_none))) { 3436 KMP_WARNING(ErrorInitializeAffinity); 3437 } 3438 __kmp_affinity_type = affinity_none; 3439 KMP_AFFINITY_DISABLE(); 3440 return; 3441 } 3442 } 3443 else { 3444 __kmp_affinity_entire_machine_mask(fullMask); 3445 __kmp_avail_proc = __kmp_xproc; 3446 } 3447 } 3448 3449 int depth = -1; 3450 kmp_i18n_id_t msg_id = kmp_i18n_null; 3451 3452 // 3453 // For backward compatibility, setting KMP_CPUINFO_FILE => 3454 // KMP_TOPOLOGY_METHOD=cpuinfo 3455 // 3456 if ((__kmp_cpuinfo_file != NULL) && 3457 (__kmp_affinity_top_method == affinity_top_method_all)) { 3458 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3459 } 3460 3461 if (__kmp_affinity_top_method == affinity_top_method_all) { 3462 // 3463 // In the default code path, errors are not fatal - we just try using 3464 // another method. We only emit a warning message if affinity is on, 3465 // or the verbose flag is set, and the nowarnings flag was not set.
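// The probing order below is: the x2APIC id leaf and then the legacy
// APIC id method (x86 only), then parsing /proc/cpuinfo (Linux* OS
// only), then Windows* OS processor groups (when there is more than
// one group), and finally the flat OS proc id map as a last resort.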
3466 // 3467 const char *file_name = NULL; 3468 int line = 0; 3469 3470 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3471 3472 if (__kmp_affinity_verbose) { 3473 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3474 } 3475 3476 file_name = NULL; 3477 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3478 if (depth == 0) { 3479 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3480 KMP_ASSERT(address2os == NULL); 3481 return; 3482 } 3483 3484 if (depth < 0) { 3485 if (__kmp_affinity_verbose) { 3486 if (msg_id != kmp_i18n_null) { 3487 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3488 KMP_I18N_STR(DecodingLegacyAPIC)); 3489 } 3490 else { 3491 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3492 } 3493 } 3494 3495 file_name = NULL; 3496 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3497 if (depth == 0) { 3498 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3499 KMP_ASSERT(address2os == NULL); 3500 return; 3501 } 3502 } 3503 3504 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3505 3506 # if KMP_OS_LINUX 3507 3508 if (depth < 0) { 3509 if (__kmp_affinity_verbose) { 3510 if (msg_id != kmp_i18n_null) { 3511 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3512 } 3513 else { 3514 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3515 } 3516 } 3517 3518 FILE *f = fopen("/proc/cpuinfo", "r"); 3519 if (f == NULL) { 3520 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3521 } 3522 else { 3523 file_name = "/proc/cpuinfo"; 3524 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3525 fclose(f); 3526 if (depth == 0) { 3527 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3528 KMP_ASSERT(address2os == NULL); 3529 return; 3530 } 3531 } 3532 } 3533 3534 # endif /* KMP_OS_LINUX */ 3535 3536 # if KMP_GROUP_AFFINITY 3537 3538 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3539 if (__kmp_affinity_verbose) { 3540 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3541 } 3542 3543 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3544 KMP_ASSERT(depth != 0); 3545 } 3546 3547 # endif /* KMP_GROUP_AFFINITY */ 3548 3549 if (depth < 0) { 3550 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3551 if (file_name == NULL) { 3552 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3553 } 3554 else if (line == 0) { 3555 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3556 } 3557 else { 3558 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3559 } 3560 } 3561 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3562 3563 file_name = ""; 3564 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3565 if (depth == 0) { 3566 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3567 KMP_ASSERT(address2os == NULL); 3568 return; 3569 } 3570 KMP_ASSERT(depth > 0); 3571 KMP_ASSERT(address2os != NULL); 3572 } 3573 } 3574 3575 // 3576 // If the user has specified that a particular topology discovery method 3577 // is to be used, then we abort if that method fails. The exception is 3578 // group affinity, which might have been implicitly set.
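// (Each case below corresponds to one explicit KMP_TOPOLOGY_METHOD
// setting - x2apicid, apicid, cpuinfo, group, or flat - and here a
// failed method is fatal instead of falling through to another one.)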
3579 // 3580 3581 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3582 3583 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3584 if (__kmp_affinity_verbose) { 3585 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3586 KMP_I18N_STR(Decodingx2APIC)); 3587 } 3588 3589 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3590 if (depth == 0) { 3591 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3592 KMP_ASSERT(address2os == NULL); 3593 return; 3594 } 3595 if (depth < 0) { 3596 KMP_ASSERT(msg_id != kmp_i18n_null); 3597 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3598 } 3599 } 3600 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3601 if (__kmp_affinity_verbose) { 3602 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3603 KMP_I18N_STR(DecodingLegacyAPIC)); 3604 } 3605 3606 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3607 if (depth == 0) { 3608 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3609 KMP_ASSERT(address2os == NULL); 3610 return; 3611 } 3612 if (depth < 0) { 3613 KMP_ASSERT(msg_id != kmp_i18n_null); 3614 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3615 } 3616 } 3617 3618 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3619 3620 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3621 const char *filename; 3622 if (__kmp_cpuinfo_file != NULL) { 3623 filename = __kmp_cpuinfo_file; 3624 } 3625 else { 3626 filename = "/proc/cpuinfo"; 3627 } 3628 3629 if (__kmp_affinity_verbose) { 3630 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3631 } 3632 3633 FILE *f = fopen(filename, "r"); 3634 if (f == NULL) { 3635 int code = errno; 3636 if (__kmp_cpuinfo_file != NULL) { 3637 __kmp_msg( 3638 kmp_ms_fatal, 3639 KMP_MSG(CantOpenFileForReading, filename), 3640 KMP_ERR(code), 3641 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3642 __kmp_msg_null 3643 ); 3644 } 3645 else { 3646 __kmp_msg( 3647 kmp_ms_fatal, 3648 KMP_MSG(CantOpenFileForReading, filename), 3649 KMP_ERR(code), 3650 __kmp_msg_null 3651 ); 3652 } 3653 } 3654 int line = 0; 3655 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3656 fclose(f); 3657 if (depth < 0) { 3658 KMP_ASSERT(msg_id != kmp_i18n_null); 3659 if (line > 0) { 3660 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3661 } 3662 else { 3663 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3664 } 3665 } 3666 if (__kmp_affinity_type == affinity_none) { 3667 KMP_ASSERT(depth == 0); 3668 KMP_ASSERT(address2os == NULL); 3669 return; 3670 } 3671 } 3672 3673 # if KMP_GROUP_AFFINITY 3674 3675 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3676 if (__kmp_affinity_verbose) { 3677 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3678 } 3679 3680 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3681 KMP_ASSERT(depth != 0); 3682 if (depth < 0) { 3683 KMP_ASSERT(msg_id != kmp_i18n_null); 3684 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3685 } 3686 } 3687 3688 # endif /* KMP_GROUP_AFFINITY */ 3689 3690 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3691 if (__kmp_affinity_verbose) { 3692 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3693 } 3694 3695 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3696 if (depth == 0) { 3697 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3698 KMP_ASSERT(address2os == NULL); 3699 return; 3700 } 3701 // should not fail 3702 KMP_ASSERT(depth > 0); 3703 KMP_ASSERT(address2os != NULL); 3704 } 3705 3706 if (address2os == NULL) { 3707 if 
(KMP_AFFINITY_CAPABLE() 3708 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3709 && (__kmp_affinity_type != affinity_none)))) { 3710 KMP_WARNING(ErrorInitializeAffinity); 3711 } 3712 __kmp_affinity_type = affinity_none; 3713 KMP_AFFINITY_DISABLE(); 3714 return; 3715 } 3716 3717 __kmp_apply_thread_places(&address2os, depth); 3718 3719 // 3720 // Create the table of masks, indexed by thread Id. 3721 // 3722 unsigned maxIndex; 3723 unsigned numUnique; 3724 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3725 address2os, __kmp_avail_proc); 3726 if (__kmp_affinity_gran_levels == 0) { 3727 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3728 } 3729 3730 // 3731 // Set the childNums vector in all Address objects. This must be done 3732 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3733 // which takes into account the setting of __kmp_affinity_compact. 3734 // 3735 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3736 3737 switch (__kmp_affinity_type) { 3738 3739 case affinity_explicit: 3740 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3741 # if OMP_40_ENABLED 3742 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3743 # endif 3744 { 3745 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3746 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3747 maxIndex); 3748 } 3749 # if OMP_40_ENABLED 3750 else { 3751 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3752 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3753 maxIndex); 3754 } 3755 # endif 3756 if (__kmp_affinity_num_masks == 0) { 3757 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3758 && (__kmp_affinity_type != affinity_none))) { 3759 KMP_WARNING(AffNoValidProcID); 3760 } 3761 __kmp_affinity_type = affinity_none; 3762 return; 3763 } 3764 break; 3765 3766 // 3767 // The other affinity types rely on sorting the Addresses according 3768 // to some permutation of the machine topology tree. Set 3769 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3770 // then jump to a common code fragment to do the sort and create 3771 // the array of affinity masks. 
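// For illustration, on a hypothetical machine with a depth 3 map
// (package / core / thread): affinity_compact with the default permute
// of 0 sorts the thread level fastest, so consecutive masks share a
// core, while affinity_scatter inverts the permute to depth - 1, so
// consecutive masks are spread across packages and cores first.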
3772 // 3773 3774 case affinity_logical: 3775 __kmp_affinity_compact = 0; 3776 if (__kmp_affinity_offset) { 3777 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3778 % __kmp_avail_proc; 3779 } 3780 goto sortAddresses; 3781 3782 case affinity_physical: 3783 if (__kmp_nThreadsPerCore > 1) { 3784 __kmp_affinity_compact = 1; 3785 if (__kmp_affinity_compact >= depth) { 3786 __kmp_affinity_compact = 0; 3787 } 3788 } else { 3789 __kmp_affinity_compact = 0; 3790 } 3791 if (__kmp_affinity_offset) { 3792 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3793 % __kmp_avail_proc; 3794 } 3795 goto sortAddresses; 3796 3797 case affinity_scatter: 3798 if (__kmp_affinity_compact >= depth) { 3799 __kmp_affinity_compact = 0; 3800 } 3801 else { 3802 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3803 } 3804 goto sortAddresses; 3805 3806 case affinity_compact: 3807 if (__kmp_affinity_compact >= depth) { 3808 __kmp_affinity_compact = depth - 1; 3809 } 3810 goto sortAddresses; 3811 3812 case affinity_balanced: 3813 // Balanced affinity works only for the case of a single package; non-uniform single-package topologies are handled specially below. 3814 if( nPackages > 1 ) { 3815 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3816 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3817 } 3818 __kmp_affinity_type = affinity_none; 3819 return; 3820 } else if( __kmp_affinity_uniform_topology() ) { 3821 break; 3822 } else { // Non-uniform topology 3823 3824 // Save the depth for further usage 3825 __kmp_aff_depth = depth; 3826 3827 // Number of hyper threads per core in HT machine 3828 int nth_per_core = __kmp_nThreadsPerCore; 3829 3830 int core_level; 3831 if( nth_per_core > 1 ) { 3832 core_level = depth - 2; 3833 } else { 3834 core_level = depth - 1; 3835 } 3836 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3837 int nproc = nth_per_core * ncores; 3838 3839 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3840 for( int i = 0; i < nproc; i++ ) { 3841 procarr[ i ] = -1; 3842 } 3843 3844 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3845 int proc = address2os[ i ].second; 3846 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3847 // If there is only one thread per core then depth == 2: level 0 - package, 3848 // level 1 - core. 3849 int level = depth - 1; 3850 3851 // __kmp_nth_per_core == 1 3852 int thread = 0; 3853 int core = address2os[ i ].first.labels[ level ]; 3854 // If the thread level exists, that is, we have more than one thread context per core 3855 if( nth_per_core > 1 ) { 3856 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3857 core = address2os[ i ].first.labels[ level - 1 ]; 3858 } 3859 procarr[ core * nth_per_core + thread ] = proc; 3860 } 3861 3862 break; 3863 } 3864 3865 sortAddresses: 3866 // 3867 // Allocate the gtid->affinity mask table.
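// For illustration, on a hypothetical 4-core machine with 2 thread
// contexts per core and granularity=core, numUnique == 4 because both
// contexts of a core share one mask; with __kmp_affinity_dups set, all
// 8 sorted entries receive a mask below, otherwise only the 4 group
// leaders do.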
3868 // 3869 if (__kmp_affinity_dups) { 3870 __kmp_affinity_num_masks = __kmp_avail_proc; 3871 } 3872 else { 3873 __kmp_affinity_num_masks = numUnique; 3874 } 3875 3876 # if OMP_40_ENABLED 3877 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3878 && ( __kmp_affinity_num_places > 0 ) 3879 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3880 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3881 } 3882 # endif 3883 3884 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3885 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3886 3887 // 3888 // Sort the address2os table according to the current setting of 3889 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3890 // 3891 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3892 __kmp_affinity_cmp_Address_child_num); 3893 { 3894 int i; 3895 unsigned j; 3896 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3897 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3898 continue; 3899 } 3900 unsigned osId = address2os[i].second; 3901 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3902 kmp_affin_mask_t *dest 3903 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3904 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3905 KMP_CPU_COPY(dest, src); 3906 if (++j >= __kmp_affinity_num_masks) { 3907 break; 3908 } 3909 } 3910 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3911 } 3912 break; 3913 3914 default: 3915 KMP_ASSERT2(0, "Unexpected affinity setting"); 3916 } 3917 3918 __kmp_free(osId2Mask); 3919 machine_hierarchy.init(address2os, __kmp_avail_proc); 3920 } 3921 3922 3923 void 3924 __kmp_affinity_initialize(void) 3925 { 3926 // 3927 // Much of the code above was written assumming that if a machine was not 3928 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3929 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3930 // 3931 // There are too many checks for __kmp_affinity_type == affinity_none 3932 // in this code. Instead of trying to change them all, check if 3933 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3934 // affinity_none, call the real initialization routine, then restore 3935 // __kmp_affinity_type to affinity_disabled. 3936 // 3937 int disabled = (__kmp_affinity_type == affinity_disabled); 3938 if (! KMP_AFFINITY_CAPABLE()) { 3939 KMP_ASSERT(disabled); 3940 } 3941 if (disabled) { 3942 __kmp_affinity_type = affinity_none; 3943 } 3944 __kmp_aux_affinity_initialize(); 3945 if (disabled) { 3946 __kmp_affinity_type = affinity_disabled; 3947 } 3948 } 3949 3950 3951 void 3952 __kmp_affinity_uninitialize(void) 3953 { 3954 if (__kmp_affinity_masks != NULL) { 3955 __kmp_free(__kmp_affinity_masks); 3956 __kmp_affinity_masks = NULL; 3957 } 3958 if (fullMask != NULL) { 3959 KMP_CPU_FREE(fullMask); 3960 fullMask = NULL; 3961 } 3962 __kmp_affinity_num_masks = 0; 3963 # if OMP_40_ENABLED 3964 __kmp_affinity_num_places = 0; 3965 # endif 3966 if (__kmp_affinity_proclist != NULL) { 3967 __kmp_free(__kmp_affinity_proclist); 3968 __kmp_affinity_proclist = NULL; 3969 } 3970 if( address2os != NULL ) { 3971 __kmp_free( address2os ); 3972 address2os = NULL; 3973 } 3974 if( procarr != NULL ) { 3975 __kmp_free( procarr ); 3976 procarr = NULL; 3977 } 3978 } 3979 3980 3981 void 3982 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3983 { 3984 if (! 
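
    // Illustrative note (not from the original source): when a place mask is
    // chosen below, the index is computed round-robin as
    //     i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
    // e.g. with 4 masks and offset 1, gtids 0,1,2,3 map to places 1,2,3,0.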

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
          || (__kmp_affinity_type == affinity_balanced)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0. Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently. See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }
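
    // Illustrative note (not from the original source): a thread's place
    // partition may wrap around the end of the place list, in which case
    // th_first_place > th_last_place. E.g. with 8 places, first_place == 6
    // and last_place == 1 denotes the partition {6, 7, 0, 1}; the asserts
    // below accept either ordering.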

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    // The ';{ ... }' argument makes the whole block the body of the trace
    // macro, so it compiles away entirely in non-debug builds.
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }
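
    // Illustrative usage sketch (not from the original source) of the
    // user-level wrappers that reach this helper; the exact entry points
    // live in the public headers:
    //     kmp_affinity_mask_t m;
    //     kmp_create_affinity_mask(&m);
    //     kmp_get_affinity(&m);
    //     if (kmp_get_affinity_mask_proc(0, &m) == 1) {
    //         /* OS proc 0 is present in this thread's mask */
    //     }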

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big" cores
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
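
        // Illustrative note (not from the original source): three cases
        // follow. (1) nthreads == __kmp_avail_proc: thread tid takes the
        // tid-th entry of the sorted address2os table. (2) nthreads <=
        // __kmp_ncores: at most one thread per non-empty core. (3) nthreads
        // > __kmp_ncores: thread contexts are filled round-robin, one thread
        // per context first, then stacking extra threads.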

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count found osIDs for the current core; there can be
                // at most nth_per_core of them. Since address2os is sorted,
                // we can break as soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
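                    // Illustrative note (not from the original source): each
                    // pass of the enclosing while loop hands out at most one
                    // extra thread per context; e.g. 5 threads over 2 cores
                    // with 2 contexts each yields newarr counts {2,1,1,1}.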
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#else
// affinity not supported

static const kmp_uint32 noaff_maxLevels=7;
kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
kmp_uint32 noaff_depth;
kmp_uint8 noaff_leaf_kids;
kmp_int8 noaff_uninitialized=1;

void noaff_init(int nprocs)
{
    kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
    if (result == 0) return; // Already initialized
    else if (result == 2) { // Someone else is initializing
        while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
        return;
    }
    KMP_DEBUG_ASSERT(result==1);

    kmp_uint32 numPerLevel[noaff_maxLevels];
    noaff_depth = 1;
    for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] and skipPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        noaff_skipPerLevel[i] = 1;
    }

    numPerLevel[0] = 4;
    numPerLevel[1] = nprocs/4;
    if (nprocs%4) numPerLevel[1]++;

    for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
        if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
            noaff_depth++;

    kmp_uint32 branch = 4;
    if (numPerLevel[0] == 1) branch = nprocs/4;
    if (branch<4) branch=4;
    for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
        while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
            if (numPerLevel[d] & 1) numPerLevel[d]++;
            numPerLevel[d] = numPerLevel[d] >> 1;
            if (numPerLevel[d+1] == 1) noaff_depth++;
            numPerLevel[d+1] = numPerLevel[d+1] << 1;
        }
        if(numPerLevel[0] == 1) {
            branch = branch >> 1;
            if (branch<4) branch = 4;
        }
    }

    for (kmp_uint32 i=1; i<noaff_depth; ++i)
        noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
        noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
    noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
    noaff_uninitialized = 0; // The single writer marks initialization as complete
}

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    if (noaff_uninitialized)
        noaff_init(nproc);

    thr_bar->depth = noaff_depth;
    thr_bar->base_leaf_kids = noaff_leaf_kids;
    thr_bar->skip_per_level = noaff_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED
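
// Illustrative walk-through of noaff_init() above (not from the original
// source): for nprocs == 10, numPerLevel becomes {4, 3, 1, ...}, i.e. up to
// 3 groups of at most 4 leaves; skipPerLevel then evaluates to {1, 4, 12, ...}
// and base_leaf_kids to 3.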