1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_i18n.h" 18 #include "kmp_io.h" 19 #include "kmp_str.h" 20 #include "kmp_wrapper_getpid.h" 21 22 #if KMP_AFFINITY_SUPPORTED 23 24 // 25 // Print the affinity mask to the character array in a pretty format. 26 // 27 char * 28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) 29 { 30 KMP_ASSERT(buf_len >= 40); 31 char *scan = buf; 32 char *end = buf + buf_len - 1; 33 34 // 35 // Find first element / check for empty set. 36 // 37 size_t i; 38 for (i = 0; i < KMP_CPU_SETSIZE; i++) { 39 if (KMP_CPU_ISSET(i, mask)) { 40 break; 41 } 42 } 43 if (i == KMP_CPU_SETSIZE) { 44 KMP_SNPRINTF(scan, buf_len, "{<empty>}"); 45 while (*scan != '\0') scan++; 46 KMP_ASSERT(scan <= end); 47 return buf; 48 } 49 50 KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i); 51 while (*scan != '\0') scan++; 52 i++; 53 for (; i < KMP_CPU_SETSIZE; i++) { 54 if (! KMP_CPU_ISSET(i, mask)) { 55 continue; 56 } 57 58 // 59 // Check for buffer overflow. A string of the form ",<n>" will have 60 // at most 10 characters, plus we want to leave room to print ",...}" 61 // if the set is too large to print for a total of 15 characters. 62 // We already left room for '\0' in setting end. 
63 // 64 if (end - scan < 15) { 65 break; 66 } 67 KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i); 68 while (*scan != '\0') scan++; 69 } 70 if (i < KMP_CPU_SETSIZE) { 71 KMP_SNPRINTF(scan, buf_len, ",..."); 72 while (*scan != '\0') scan++; 73 } 74 KMP_SNPRINTF(scan, buf_len, "}"); 75 while (*scan != '\0') scan++; 76 KMP_ASSERT(scan <= end); 77 return buf; 78 } 79 80 81 void 82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) 83 { 84 KMP_CPU_ZERO(mask); 85 86 # if KMP_GROUP_AFFINITY 87 88 if (__kmp_num_proc_groups > 1) { 89 int group; 90 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 91 for (group = 0; group < __kmp_num_proc_groups; group++) { 92 int i; 93 int num = __kmp_GetActiveProcessorCount(group); 94 for (i = 0; i < num; i++) { 95 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 96 } 97 } 98 } 99 else 100 101 # endif /* KMP_GROUP_AFFINITY */ 102 103 { 104 int proc; 105 for (proc = 0; proc < __kmp_xproc; proc++) { 106 KMP_CPU_SET(proc, mask); 107 } 108 } 109 } 110 111 112 // 113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member 114 // functions. 115 // 116 // The icc codegen emits sections with extremely long names, of the form 117 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug 118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving 119 // some sort of memory corruption or table overflow that is triggered by 120 // these long strings. I checked the latest version of the linker - 121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not 122 // fixed. 123 // 124 // Unfortunately, my attempts to reproduce it in a smaller example have 125 // failed - I'm not sure what the prospects are of getting it fixed 126 // properly - but we need a reproducer smaller than all of libiomp. 127 // 128 // Work around the problem by avoiding inline constructors in such builds. 
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

//
// An Address is one path from the root of the machine topology tree to a
// leaf (an OS proc): labels[] holds the hardware ids at each level (package,
// core, thread, ...), childNums[] the renumbered ordinal positions (see
// __kmp_affinity_assign_child_nums), depth the number of valid entries.
//
class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    // NOTE: assignment deliberately resets leader to FALSE instead of
    // copying b.leader - keep the out-of-line variant below in sync.
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if(labels[i] != b.labels[i])
                return false;
        return true;
    }
    // True when a and b share all ancestors above the bottom 'level' levels.
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if(labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

// Pairs a topology Address with the OS proc id it maps to.
class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

//
// Out-of-line variant of the classes above, used in debug/cover builds to
// work around the linker bug described earlier.  The two variants must
// stay semantically identical.
//
class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

// NOTE: resets leader rather than copying it - same as the inline variant.
Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if(labels[i] != b.labels[i])
            return false;
    return true;
}

// True when a and b share all ancestors above the bottom 'level' levels.
bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if(labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


//
// qsort comparator: orders AddrUnsPair entries lexicographically by their
// raw hardware labels.  Both addresses must have the same depth.
//
static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static
int
//
// qsort comparator: orders AddrUnsPair entries by ordinal child numbers,
// rotated by __kmp_affinity_compact so that "compact" affinity can choose
// which topology levels vary fastest.  Requires that
// __kmp_affinity_assign_child_nums() has already filled in childNums.
//
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    // First compare the __kmp_affinity_compact innermost levels...
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    // ...then the remaining outer levels.
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}

/** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
        etc.  We don't want to get specific with nomenclature */
    static const kmp_uint32 maxLevels=7;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    bool uninitialized;

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 numPerLevel[maxLevels];
    // skipPerLevel[i] = product of numPerLevel[0..i-1]: the stride between
    // siblings at level i in a flattened tree (computed at the end of init).
    kmp_uint32 skipPerLevel[maxLevels];

    // Fill numPerLevel[] from a label-sorted address table: for each topology
    // level (innermost first) record max(childNum)+1 as the fan-out.
    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(true) {}

    // Build the hierarchy from adr2os (or a flat 4-ary guess when adr2os is
    // NULL), then rebalance so no level has fan-out wider than 'branch'.
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        /* Added explicit initialization of the depth here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        uninitialized = false;
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            // No topology info: assume 4 threads per "core", round up.
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                // Halve this level (rounding up) and push the factor of 2 up
                // to the next level, growing depth if that level was empty.
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];

    }
};

// Lazily initialized singleton; see __kmp_get_hierarchy().
static hierarchy_info machine_hierarchy;

//
// Fill in thr_bar's view of the machine hierarchy (depth, leaf fan-out, and
// the per-level skip table), initializing machine_hierarchy on first use.
// If nproc exceeds the capacity of the current hierarchy, extra levels of
// fan-out 2 are appended.
//
// NOTE(review): the while loop below increments depth and writes
// skipPerLevel[depth-1] without checking depth against
// hierarchy_info::maxLevels - for sufficiently large nproc this looks like
// it could index past the skipPerLevel array; TODO confirm upstream bound.
//
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    if (machine_hierarchy.uninitialized)
        machine_hierarchy.init(NULL, nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    while (nproc > machine_hierarchy.skipPerLevel[depth-1]) {
        depth++;
        machine_hierarchy.skipPerLevel[depth-1] = 2*machine_hierarchy.skipPerLevel[depth-2];
    }
    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example:  suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
430 // 431 static void 432 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 433 int numAddrs) 434 { 435 KMP_DEBUG_ASSERT(numAddrs > 0); 436 int depth = address2os->first.depth; 437 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 438 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth 439 * sizeof(unsigned)); 440 int labCt; 441 for (labCt = 0; labCt < depth; labCt++) { 442 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 443 lastLabel[labCt] = address2os[0].first.labels[labCt]; 444 } 445 int i; 446 for (i = 1; i < numAddrs; i++) { 447 for (labCt = 0; labCt < depth; labCt++) { 448 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 449 int labCt2; 450 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 451 counts[labCt2] = 0; 452 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 453 } 454 counts[labCt]++; 455 lastLabel[labCt] = address2os[i].first.labels[labCt]; 456 break; 457 } 458 } 459 for (labCt = 0; labCt < depth; labCt++) { 460 address2os[i].first.childNums[labCt] = counts[labCt]; 461 } 462 for (; labCt < (int)Address::maxDepth; labCt++) { 463 address2os[i].first.childNums[labCt] = 0; 464 } 465 } 466 } 467 468 469 // 470 // All of the __kmp_affinity_create_*_map() routines should set 471 // __kmp_affinity_masks to a vector of affinity mask objects of length 472 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and 473 // return the number of levels in the machine topology tree (zero if 474 // __kmp_affinity_type == affinity_none). 475 // 476 // All of the __kmp_affinity_create_*_map() routines should set *fullMask 477 // to the affinity mask for the initialization thread. They need to save and 478 // restore the mask, and it could be needed later, so saving it is just an 479 // optimization to avoid calling kmp_get_system_affinity() again. 
//
// Mask of the initialization thread, saved by the create_*_map routines.
static kmp_affin_mask_t *fullMask = NULL;

// Accessor for fullMask (used elsewhere in the runtime).
kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


// Machine-shape globals filled in by the topology-detection routines.
static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.  pkgLevel/coreLevel/threadLevel give the depth index of
// each named level (-1 when that level is absent from the map).
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                // Levels above the package are reported as numbered nodes.
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                // Unnamed level: print its depth index.
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned: one depth-1 Address per
    // available proc, labeled with its OS proc id.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned: one depth-2 Address per
    // available proc, labeled with (group number, proc-within-group).
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        // Each group occupies one DWORD_PTR-sized stripe of the mask.
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with
            // group topology method, using "thread"
            // NOTE(review): gran_str is computed above but never used -
            // presumably a KMP_WARNING call carrying gran_str was intended
            // here; TODO confirm against the message catalog.
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

//
// Return the number of bits needed to represent 'count' distinct values,
// i.e. the smallest r with (1 << r) >= count.
//
static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}


// Per-thread record collected while bound to each OS proc in turn.
class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


// qsort comparator: orders apicThreadInfo records by OS proc id.
static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


// qsort comparator: orders apicThreadInfo records by physical location,
// i.e. package id, then core id, then thread id.
static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
785 // 786 static int 787 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 788 kmp_i18n_id_t *const msg_id) 789 { 790 kmp_cpuid buf; 791 int rc; 792 *address2os = NULL; 793 *msg_id = kmp_i18n_null; 794 795 // 796 // Check if cpuid leaf 4 is supported. 797 // 798 __kmp_x86_cpuid(0, 0, &buf); 799 if (buf.eax < 4) { 800 *msg_id = kmp_i18n_str_NoLeaf4Support; 801 return -1; 802 } 803 804 // 805 // The algorithm used starts by setting the affinity to each available 806 // thread and retrieving info from the cpuid instruction, so if we are 807 // not capable of calling __kmp_get_system_affinity() and 808 // _kmp_get_system_affinity(), then we need to do something else - use 809 // the defaults that we calculated from issuing cpuid without binding 810 // to each proc. 811 // 812 if (! KMP_AFFINITY_CAPABLE()) { 813 // 814 // Hack to try and infer the machine topology using only the data 815 // available from cpuid on the current thread, and __kmp_xproc. 816 // 817 KMP_ASSERT(__kmp_affinity_type == affinity_none); 818 819 // 820 // Get an upper bound on the number of threads per package using 821 // cpuid(1). 822 // 823 // On some OS/chps combinations where HT is supported by the chip 824 // but is disabled, this value will be 2 on a single core chip. 825 // Usually, it will be 2 if HT is enabled and 1 if HT is disabled. 826 // 827 __kmp_x86_cpuid(1, 0, &buf); 828 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 829 if (maxThreadsPerPkg == 0) { 830 maxThreadsPerPkg = 1; 831 } 832 833 // 834 // The num cores per pkg comes from cpuid(4). 835 // 1 must be added to the encoded value. 836 // 837 // The author of cpu_count.cpp treated this only an upper bound 838 // on the number of cores, but I haven't seen any cases where it 839 // was greater than the actual number of cores, so we will treat 840 // it as exact in this block of code. 841 // 842 // First, we need to check if cpuid(4) is supported on this chip. 
843 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 844 // has the value n or greater. 845 // 846 __kmp_x86_cpuid(0, 0, &buf); 847 if (buf.eax >= 4) { 848 __kmp_x86_cpuid(4, 0, &buf); 849 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 850 } 851 else { 852 nCoresPerPkg = 1; 853 } 854 855 // 856 // There is no way to reliably tell if HT is enabled without issuing 857 // the cpuid instruction from every thread, can correlating the cpuid 858 // info, so if the machine is not affinity capable, we assume that HT 859 // is off. We have seen quite a few machines where maxThreadsPerPkg 860 // is 2, yet the machine does not support HT. 861 // 862 // - Older OSes are usually found on machines with older chips, which 863 // do not support HT. 864 // 865 // - The performance penalty for mistakenly identifying a machine as 866 // HT when it isn't (which results in blocktime being incorrecly set 867 // to 0) is greater than the penalty when for mistakenly identifying 868 // a machine as being 1 thread/core when it is really HT enabled 869 // (which results in blocktime being incorrectly set to a positive 870 // value). 871 // 872 __kmp_ncores = __kmp_xproc; 873 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 874 __kmp_nThreadsPerCore = 1; 875 if (__kmp_affinity_verbose) { 876 KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY"); 877 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 878 if (__kmp_affinity_uniform_topology()) { 879 KMP_INFORM(Uniform, "KMP_AFFINITY"); 880 } else { 881 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 882 } 883 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 884 __kmp_nThreadsPerCore, __kmp_ncores); 885 } 886 return 0; 887 } 888 889 // 890 // 891 // From here on, we can assume that it is safe to call 892 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), 893 // even if __kmp_affinity_type = affinity_none. 894 // 895 896 // 897 // Save the affinity mask for the current thread. 
898 // 899 kmp_affin_mask_t *oldMask; 900 KMP_CPU_ALLOC(oldMask); 901 KMP_ASSERT(oldMask != NULL); 902 __kmp_get_system_affinity(oldMask, TRUE); 903 904 // 905 // Run through each of the available contexts, binding the current thread 906 // to it, and obtaining the pertinent information using the cpuid instr. 907 // 908 // The relevant information is: 909 // 910 // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context 911 // has a uniqie Apic Id, which is of the form pkg# : core# : thread#. 912 // 913 // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The 914 // value of this field determines the width of the core# + thread# 915 // fields in the Apic Id. It is also an upper bound on the number 916 // of threads per package, but it has been verified that situations 917 // happen were it is not exact. In particular, on certain OS/chip 918 // combinations where Intel(R) Hyper-Threading Technology is supported 919 // by the chip but has 920 // been disabled, the value of this field will be 2 (for a single core 921 // chip). On other OS/chip combinations supporting 922 // Intel(R) Hyper-Threading Technology, the value of 923 // this field will be 1 when Intel(R) Hyper-Threading Technology is 924 // disabled and 2 when it is enabled. 925 // 926 // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The 927 // value of this field (+1) determines the width of the core# field in 928 // the Apic Id. The comments in "cpucount.cpp" say that this value is 929 // an upper bound, but the IA-32 architecture manual says that it is 930 // exactly the number of cores per package, and I haven't seen any 931 // case where it wasn't. 932 // 933 // From this information, deduce the package Id, core Id, and thread Id, 934 // and set the corresponding fields in the apicThreadInfo struct. 
935 // 936 unsigned i; 937 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( 938 __kmp_avail_proc * sizeof(apicThreadInfo)); 939 unsigned nApics = 0; 940 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 941 // 942 // Skip this proc if it is not included in the machine model. 943 // 944 if (! KMP_CPU_ISSET(i, fullMask)) { 945 continue; 946 } 947 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 948 949 __kmp_affinity_bind_thread(i); 950 threadInfo[nApics].osId = i; 951 952 // 953 // The apic id and max threads per pkg come from cpuid(1). 954 // 955 __kmp_x86_cpuid(1, 0, &buf); 956 if (! (buf.edx >> 9) & 1) { 957 __kmp_set_system_affinity(oldMask, TRUE); 958 __kmp_free(threadInfo); 959 KMP_CPU_FREE(oldMask); 960 *msg_id = kmp_i18n_str_ApicNotPresent; 961 return -1; 962 } 963 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 964 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 965 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 966 threadInfo[nApics].maxThreadsPerPkg = 1; 967 } 968 969 // 970 // Max cores per pkg comes from cpuid(4). 971 // 1 must be added to the encoded value. 972 // 973 // First, we need to check if cpuid(4) is supported on this chip. 974 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 975 // has the value n or greater. 976 // 977 __kmp_x86_cpuid(0, 0, &buf); 978 if (buf.eax >= 4) { 979 __kmp_x86_cpuid(4, 0, &buf); 980 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 981 } 982 else { 983 threadInfo[nApics].maxCoresPerPkg = 1; 984 } 985 986 // 987 // Infer the pkgId / coreId / threadId using only the info 988 // obtained locally. 
989 // 990 int widthCT = __kmp_cpuid_mask_width( 991 threadInfo[nApics].maxThreadsPerPkg); 992 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 993 994 int widthC = __kmp_cpuid_mask_width( 995 threadInfo[nApics].maxCoresPerPkg); 996 int widthT = widthCT - widthC; 997 if (widthT < 0) { 998 // 999 // I've never seen this one happen, but I suppose it could, if 1000 // the cpuid instruction on a chip was really screwed up. 1001 // Make sure to restore the affinity mask before the tail call. 1002 // 1003 __kmp_set_system_affinity(oldMask, TRUE); 1004 __kmp_free(threadInfo); 1005 KMP_CPU_FREE(oldMask); 1006 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1007 return -1; 1008 } 1009 1010 int maskC = (1 << widthC) - 1; 1011 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) 1012 &maskC; 1013 1014 int maskT = (1 << widthT) - 1; 1015 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT; 1016 1017 nApics++; 1018 } 1019 1020 // 1021 // We've collected all the info we need. 1022 // Restore the old affinity mask for this thread. 1023 // 1024 __kmp_set_system_affinity(oldMask, TRUE); 1025 1026 // 1027 // If there's only one thread context to bind to, form an Address object 1028 // with depth 1 and return immediately (or, if affinity is off, set 1029 // address2os to NULL and return). 1030 // 1031 // If it is configured to omit the package level when there is only a 1032 // single package, the logic at the end of this routine won't work if 1033 // there is only a single thread - it would try to form an Address 1034 // object with depth 0. 
1035 // 1036 KMP_ASSERT(nApics > 0); 1037 if (nApics == 1) { 1038 __kmp_ncores = nPackages = 1; 1039 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1040 if (__kmp_affinity_verbose) { 1041 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1042 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1043 1044 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1045 if (__kmp_affinity_respect_mask) { 1046 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1047 } else { 1048 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1049 } 1050 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1051 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1052 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1053 __kmp_nThreadsPerCore, __kmp_ncores); 1054 } 1055 1056 if (__kmp_affinity_type == affinity_none) { 1057 __kmp_free(threadInfo); 1058 KMP_CPU_FREE(oldMask); 1059 return 0; 1060 } 1061 1062 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1063 Address addr(1); 1064 addr.labels[0] = threadInfo[0].pkgId; 1065 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1066 1067 if (__kmp_affinity_gran_levels < 0) { 1068 __kmp_affinity_gran_levels = 0; 1069 } 1070 1071 if (__kmp_affinity_verbose) { 1072 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1073 } 1074 1075 __kmp_free(threadInfo); 1076 KMP_CPU_FREE(oldMask); 1077 return 1; 1078 } 1079 1080 // 1081 // Sort the threadInfo table by physical Id. 1082 // 1083 qsort(threadInfo, nApics, sizeof(*threadInfo), 1084 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1085 1086 // 1087 // The table is now sorted by pkgId / coreId / threadId, but we really 1088 // don't know the radix of any of the fields. pkgId's may be sparsely 1089 // assigned among the chips on a system. Although coreId's are usually 1090 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1091 // [0..threadsPerCore-1], we don't want to make any such assumptions. 
1092 // 1093 // For that matter, we don't know what coresPerPkg and threadsPerCore 1094 // (or the total # packages) are at this point - we want to determine 1095 // that now. We only have an upper bound on the first two figures. 1096 // 1097 // We also perform a consistency check at this point: the values returned 1098 // by the cpuid instruction for any thread bound to a given package had 1099 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1100 // 1101 nPackages = 1; 1102 nCoresPerPkg = 1; 1103 __kmp_nThreadsPerCore = 1; 1104 unsigned nCores = 1; 1105 1106 unsigned pkgCt = 1; // to determine radii 1107 unsigned lastPkgId = threadInfo[0].pkgId; 1108 unsigned coreCt = 1; 1109 unsigned lastCoreId = threadInfo[0].coreId; 1110 unsigned threadCt = 1; 1111 unsigned lastThreadId = threadInfo[0].threadId; 1112 1113 // intra-pkg consist checks 1114 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1115 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1116 1117 for (i = 1; i < nApics; i++) { 1118 if (threadInfo[i].pkgId != lastPkgId) { 1119 nCores++; 1120 pkgCt++; 1121 lastPkgId = threadInfo[i].pkgId; 1122 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1123 coreCt = 1; 1124 lastCoreId = threadInfo[i].coreId; 1125 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1126 threadCt = 1; 1127 lastThreadId = threadInfo[i].threadId; 1128 1129 // 1130 // This is a different package, so go on to the next iteration 1131 // without doing any consistency checks. Reset the consistency 1132 // check vars, though. 
1133 // 1134 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1135 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1136 continue; 1137 } 1138 1139 if (threadInfo[i].coreId != lastCoreId) { 1140 nCores++; 1141 coreCt++; 1142 lastCoreId = threadInfo[i].coreId; 1143 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1144 threadCt = 1; 1145 lastThreadId = threadInfo[i].threadId; 1146 } 1147 else if (threadInfo[i].threadId != lastThreadId) { 1148 threadCt++; 1149 lastThreadId = threadInfo[i].threadId; 1150 } 1151 else { 1152 __kmp_free(threadInfo); 1153 KMP_CPU_FREE(oldMask); 1154 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1155 return -1; 1156 } 1157 1158 // 1159 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1160 // fields agree between all the threads bounds to a given package. 1161 // 1162 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) 1163 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1164 __kmp_free(threadInfo); 1165 KMP_CPU_FREE(oldMask); 1166 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1167 return -1; 1168 } 1169 } 1170 nPackages = pkgCt; 1171 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1172 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1173 1174 // 1175 // When affinity is off, this routine will still be called to set 1176 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1177 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1178 // correctly, and return now if affinity is not enabled. 
1179 // 1180 __kmp_ncores = nCores; 1181 if (__kmp_affinity_verbose) { 1182 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1183 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1184 1185 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1186 if (__kmp_affinity_respect_mask) { 1187 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1188 } else { 1189 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1190 } 1191 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1192 if (__kmp_affinity_uniform_topology()) { 1193 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1194 } else { 1195 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1196 } 1197 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1198 __kmp_nThreadsPerCore, __kmp_ncores); 1199 1200 } 1201 1202 if (__kmp_affinity_type == affinity_none) { 1203 __kmp_free(threadInfo); 1204 KMP_CPU_FREE(oldMask); 1205 return 0; 1206 } 1207 1208 // 1209 // Now that we've determined the number of packages, the number of cores 1210 // per package, and the number of threads per core, we can construct the 1211 // data structure that is to be returned. 1212 // 1213 int pkgLevel = 0; 1214 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1215 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1216 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1217 1218 KMP_ASSERT(depth > 0); 1219 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1220 1221 for (i = 0; i < nApics; ++i) { 1222 Address addr(depth); 1223 unsigned os = threadInfo[i].osId; 1224 int d = 0; 1225 1226 if (pkgLevel >= 0) { 1227 addr.labels[d++] = threadInfo[i].pkgId; 1228 } 1229 if (coreLevel >= 0) { 1230 addr.labels[d++] = threadInfo[i].coreId; 1231 } 1232 if (threadLevel >= 0) { 1233 addr.labels[d++] = threadInfo[i].threadId; 1234 } 1235 (*address2os)[i] = AddrUnsPair(addr, os); 1236 } 1237 1238 if (__kmp_affinity_gran_levels < 0) { 1239 // 1240 // Set the granularity level based on what levels are modeled 1241 // in the machine topology map. 1242 // 1243 __kmp_affinity_gran_levels = 0; 1244 if ((threadLevel >= 0) 1245 && (__kmp_affinity_gran > affinity_gran_thread)) { 1246 __kmp_affinity_gran_levels++; 1247 } 1248 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1249 __kmp_affinity_gran_levels++; 1250 } 1251 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1252 __kmp_affinity_gran_levels++; 1253 } 1254 } 1255 1256 if (__kmp_affinity_verbose) { 1257 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1258 coreLevel, threadLevel); 1259 } 1260 1261 __kmp_free(threadInfo); 1262 KMP_CPU_FREE(oldMask); 1263 return depth; 1264 } 1265 1266 1267 // 1268 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1269 // architectures support a newer interface for specifying the x2APIC Ids, 1270 // based on cpuid leaf 11. 1271 // 1272 static int 1273 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1274 kmp_i18n_id_t *const msg_id) 1275 { 1276 kmp_cpuid buf; 1277 1278 *address2os = NULL; 1279 *msg_id = kmp_i18n_null; 1280 1281 // 1282 // Check to see if cpuid leaf 11 is supported. 
1283 // 1284 __kmp_x86_cpuid(0, 0, &buf); 1285 if (buf.eax < 11) { 1286 *msg_id = kmp_i18n_str_NoLeaf11Support; 1287 return -1; 1288 } 1289 __kmp_x86_cpuid(11, 0, &buf); 1290 if (buf.ebx == 0) { 1291 *msg_id = kmp_i18n_str_NoLeaf11Support; 1292 return -1; 1293 } 1294 1295 // 1296 // Find the number of levels in the machine topology. While we're at it, 1297 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will 1298 // try to get more accurate values later by explicitly counting them, 1299 // but get reasonable defaults now, in case we return early. 1300 // 1301 int level; 1302 int threadLevel = -1; 1303 int coreLevel = -1; 1304 int pkgLevel = -1; 1305 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1306 1307 for (level = 0;; level++) { 1308 if (level > 31) { 1309 // 1310 // FIXME: Hack for DPD200163180 1311 // 1312 // If level is big then something went wrong -> exiting 1313 // 1314 // There could actually be 32 valid levels in the machine topology, 1315 // but so far, the only machine we have seen which does not exit 1316 // this loop before iteration 32 has fubar x2APIC settings. 1317 // 1318 // For now, just reject this case based upon loop trip count. 
1319 // 1320 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1321 return -1; 1322 } 1323 __kmp_x86_cpuid(11, level, &buf); 1324 if (buf.ebx == 0) { 1325 if (pkgLevel < 0) { 1326 // 1327 // Will infer nPackages from __kmp_xproc 1328 // 1329 pkgLevel = level; 1330 level++; 1331 } 1332 break; 1333 } 1334 int kind = (buf.ecx >> 8) & 0xff; 1335 if (kind == 1) { 1336 // 1337 // SMT level 1338 // 1339 threadLevel = level; 1340 coreLevel = -1; 1341 pkgLevel = -1; 1342 __kmp_nThreadsPerCore = buf.ebx & 0xff; 1343 if (__kmp_nThreadsPerCore == 0) { 1344 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1345 return -1; 1346 } 1347 } 1348 else if (kind == 2) { 1349 // 1350 // core level 1351 // 1352 coreLevel = level; 1353 pkgLevel = -1; 1354 nCoresPerPkg = buf.ebx & 0xff; 1355 if (nCoresPerPkg == 0) { 1356 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1357 return -1; 1358 } 1359 } 1360 else { 1361 if (level <= 0) { 1362 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1363 return -1; 1364 } 1365 if (pkgLevel >= 0) { 1366 continue; 1367 } 1368 pkgLevel = level; 1369 nPackages = buf.ebx & 0xff; 1370 if (nPackages == 0) { 1371 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1372 return -1; 1373 } 1374 } 1375 } 1376 int depth = level; 1377 1378 // 1379 // In the above loop, "level" was counted from the finest level (usually 1380 // thread) to the coarsest. The caller expects that we will place the 1381 // labels in (*address2os)[].first.labels[] in the inverse order, so 1382 // we need to invert the vars saying which level means what. 
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        // Round nPackages up so every proc is accounted for even when
        // __kmp_xproc is not a multiple of nCoresPerPkg.
        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
1441 // 1442 AddrUnsPair *retval = (AddrUnsPair *) 1443 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1444 1445 // 1446 // Run through each of the available contexts, binding the current thread 1447 // to it, and obtaining the pertinent information using the cpuid instr. 1448 // 1449 unsigned int proc; 1450 int nApics = 0; 1451 for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) { 1452 // 1453 // Skip this proc if it is not included in the machine model. 1454 // 1455 if (! KMP_CPU_ISSET(proc, fullMask)) { 1456 continue; 1457 } 1458 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1459 1460 __kmp_affinity_bind_thread(proc); 1461 1462 // 1463 // Extrach the labels for each level in the machine topology map 1464 // from the Apic ID. 1465 // 1466 Address addr(depth); 1467 int prev_shift = 0; 1468 1469 for (level = 0; level < depth; level++) { 1470 __kmp_x86_cpuid(11, level, &buf); 1471 unsigned apicId = buf.edx; 1472 if (buf.ebx == 0) { 1473 if (level != depth - 1) { 1474 KMP_CPU_FREE(oldMask); 1475 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1476 return -1; 1477 } 1478 addr.labels[depth - level - 1] = apicId >> prev_shift; 1479 level++; 1480 break; 1481 } 1482 int shift = buf.eax & 0x1f; 1483 int mask = (1 << shift) - 1; 1484 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1485 prev_shift = shift; 1486 } 1487 if (level != depth) { 1488 KMP_CPU_FREE(oldMask); 1489 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1490 return -1; 1491 } 1492 1493 retval[nApics] = AddrUnsPair(addr, proc); 1494 nApics++; 1495 } 1496 1497 // 1498 // We've collected all the info we need. 1499 // Restore the old affinity mask for this thread. 1500 // 1501 __kmp_set_system_affinity(oldMask, TRUE); 1502 1503 // 1504 // If there's only one thread context to bind to, return now. 
1505 // 1506 KMP_ASSERT(nApics > 0); 1507 if (nApics == 1) { 1508 __kmp_ncores = nPackages = 1; 1509 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1510 if (__kmp_affinity_verbose) { 1511 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1512 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1513 1514 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1515 if (__kmp_affinity_respect_mask) { 1516 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1517 } else { 1518 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1519 } 1520 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1521 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1522 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1523 __kmp_nThreadsPerCore, __kmp_ncores); 1524 } 1525 1526 if (__kmp_affinity_type == affinity_none) { 1527 __kmp_free(retval); 1528 KMP_CPU_FREE(oldMask); 1529 return 0; 1530 } 1531 1532 // 1533 // Form an Address object which only includes the package level. 1534 // 1535 Address addr(1); 1536 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1537 retval[0].first = addr; 1538 1539 if (__kmp_affinity_gran_levels < 0) { 1540 __kmp_affinity_gran_levels = 0; 1541 } 1542 1543 if (__kmp_affinity_verbose) { 1544 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1545 } 1546 1547 *address2os = retval; 1548 KMP_CPU_FREE(oldMask); 1549 return 1; 1550 } 1551 1552 // 1553 // Sort the table by physical Id. 1554 // 1555 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1556 1557 // 1558 // Find the radix at each of the levels. 
1559 // 1560 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1561 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1562 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1563 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1564 for (level = 0; level < depth; level++) { 1565 totals[level] = 1; 1566 maxCt[level] = 1; 1567 counts[level] = 1; 1568 last[level] = retval[0].first.labels[level]; 1569 } 1570 1571 // 1572 // From here on, the iteration variable "level" runs from the finest 1573 // level to the coarsest, i.e. we iterate forward through 1574 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1575 // backwards. 1576 // 1577 for (proc = 1; (int)proc < nApics; proc++) { 1578 int level; 1579 for (level = 0; level < depth; level++) { 1580 if (retval[proc].first.labels[level] != last[level]) { 1581 int j; 1582 for (j = level + 1; j < depth; j++) { 1583 totals[j]++; 1584 counts[j] = 1; 1585 // The line below causes printing incorrect topology information 1586 // in case the max value for some level (maxCt[level]) is encountered earlier than 1587 // some less value while going through the array. 1588 // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2 1589 // whereas it must be 4. 1590 // TODO!!! 
Check if it can be commented safely 1591 //maxCt[j] = 1; 1592 last[j] = retval[proc].first.labels[j]; 1593 } 1594 totals[level]++; 1595 counts[level]++; 1596 if (counts[level] > maxCt[level]) { 1597 maxCt[level] = counts[level]; 1598 } 1599 last[level] = retval[proc].first.labels[level]; 1600 break; 1601 } 1602 else if (level == depth - 1) { 1603 __kmp_free(last); 1604 __kmp_free(maxCt); 1605 __kmp_free(counts); 1606 __kmp_free(totals); 1607 __kmp_free(retval); 1608 KMP_CPU_FREE(oldMask); 1609 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1610 return -1; 1611 } 1612 } 1613 } 1614 1615 // 1616 // When affinity is off, this routine will still be called to set 1617 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1618 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1619 // correctly, and return if affinity is not enabled. 1620 // 1621 if (threadLevel >= 0) { 1622 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1623 } 1624 else { 1625 __kmp_nThreadsPerCore = 1; 1626 } 1627 nPackages = totals[pkgLevel]; 1628 1629 if (coreLevel >= 0) { 1630 __kmp_ncores = totals[coreLevel]; 1631 nCoresPerPkg = maxCt[coreLevel]; 1632 } 1633 else { 1634 __kmp_ncores = nPackages; 1635 nCoresPerPkg = 1; 1636 } 1637 1638 // 1639 // Check to see if the machine topology is uniform 1640 // 1641 unsigned prod = maxCt[0]; 1642 for (level = 1; level < depth; level++) { 1643 prod *= maxCt[level]; 1644 } 1645 bool uniform = (prod == totals[level - 1]); 1646 1647 // 1648 // Print the machine topology summary. 
1649 // 1650 if (__kmp_affinity_verbose) { 1651 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1652 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1653 1654 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1655 if (__kmp_affinity_respect_mask) { 1656 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1657 } else { 1658 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1659 } 1660 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1661 if (uniform) { 1662 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1663 } else { 1664 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1665 } 1666 1667 kmp_str_buf_t buf; 1668 __kmp_str_buf_init(&buf); 1669 1670 __kmp_str_buf_print(&buf, "%d", totals[0]); 1671 for (level = 1; level <= pkgLevel; level++) { 1672 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1673 } 1674 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1675 __kmp_nThreadsPerCore, __kmp_ncores); 1676 1677 __kmp_str_buf_free(&buf); 1678 } 1679 1680 if (__kmp_affinity_type == affinity_none) { 1681 __kmp_free(last); 1682 __kmp_free(maxCt); 1683 __kmp_free(counts); 1684 __kmp_free(totals); 1685 __kmp_free(retval); 1686 KMP_CPU_FREE(oldMask); 1687 return 0; 1688 } 1689 1690 // 1691 // Find any levels with radiix 1, and remove them from the map 1692 // (except for the package level). 1693 // 1694 int new_depth = 0; 1695 for (level = 0; level < depth; level++) { 1696 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1697 continue; 1698 } 1699 new_depth++; 1700 } 1701 1702 // 1703 // If we are removing any levels, allocate a new vector to return, 1704 // and copy the relevant information to it. 
1705 // 1706 if (new_depth != depth) { 1707 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1708 sizeof(AddrUnsPair) * nApics); 1709 for (proc = 0; (int)proc < nApics; proc++) { 1710 Address addr(new_depth); 1711 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1712 } 1713 int new_level = 0; 1714 for (level = 0; level < depth; level++) { 1715 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1716 if (level == threadLevel) { 1717 threadLevel = -1; 1718 } 1719 else if ((threadLevel >= 0) && (level < threadLevel)) { 1720 threadLevel--; 1721 } 1722 if (level == coreLevel) { 1723 coreLevel = -1; 1724 } 1725 else if ((coreLevel >= 0) && (level < coreLevel)) { 1726 coreLevel--; 1727 } 1728 if (level < pkgLevel) { 1729 pkgLevel--; 1730 } 1731 continue; 1732 } 1733 for (proc = 0; (int)proc < nApics; proc++) { 1734 new_retval[proc].first.labels[new_level] 1735 = retval[proc].first.labels[level]; 1736 } 1737 new_level++; 1738 } 1739 1740 __kmp_free(retval); 1741 retval = new_retval; 1742 depth = new_depth; 1743 } 1744 1745 if (__kmp_affinity_gran_levels < 0) { 1746 // 1747 // Set the granularity level based on what levels are modeled 1748 // in the machine topology map. 
1749 // 1750 __kmp_affinity_gran_levels = 0; 1751 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1752 __kmp_affinity_gran_levels++; 1753 } 1754 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1755 __kmp_affinity_gran_levels++; 1756 } 1757 if (__kmp_affinity_gran > affinity_gran_package) { 1758 __kmp_affinity_gran_levels++; 1759 } 1760 } 1761 1762 if (__kmp_affinity_verbose) { 1763 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1764 coreLevel, threadLevel); 1765 } 1766 1767 __kmp_free(last); 1768 __kmp_free(maxCt); 1769 __kmp_free(counts); 1770 __kmp_free(totals); 1771 KMP_CPU_FREE(oldMask); 1772 *address2os = retval; 1773 return depth; 1774 } 1775 1776 1777 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1778 1779 1780 #define osIdIndex 0 1781 #define threadIdIndex 1 1782 #define coreIdIndex 2 1783 #define pkgIdIndex 3 1784 #define nodeIdIndex 4 1785 1786 typedef unsigned *ProcCpuInfo; 1787 static unsigned maxIndex = pkgIdIndex; 1788 1789 1790 static int 1791 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1792 { 1793 const unsigned *aa = (const unsigned *)a; 1794 const unsigned *bb = (const unsigned *)b; 1795 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1796 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1797 return 0; 1798 }; 1799 1800 1801 static int 1802 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1803 { 1804 unsigned i; 1805 const unsigned *aa = *((const unsigned **)a); 1806 const unsigned *bb = *((const unsigned **)b); 1807 for (i = maxIndex; ; i--) { 1808 if (aa[i] < bb[i]) return -1; 1809 if (aa[i] > bb[i]) return 1; 1810 if (i == osIdIndex) break; 1811 } 1812 return 0; 1813 } 1814 1815 1816 // 1817 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1818 // affinity map. 
1819 // 1820 static int 1821 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1822 kmp_i18n_id_t *const msg_id, FILE *f) 1823 { 1824 *address2os = NULL; 1825 *msg_id = kmp_i18n_null; 1826 1827 // 1828 // Scan of the file, and count the number of "processor" (osId) fields, 1829 // and find the highest value of <n> for a node_<n> field. 1830 // 1831 char buf[256]; 1832 unsigned num_records = 0; 1833 while (! feof(f)) { 1834 buf[sizeof(buf) - 1] = 1; 1835 if (! fgets(buf, sizeof(buf), f)) { 1836 // 1837 // Read errors presumably because of EOF 1838 // 1839 break; 1840 } 1841 1842 char s1[] = "processor"; 1843 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1844 num_records++; 1845 continue; 1846 } 1847 1848 // 1849 // FIXME - this will match "node_<n> <garbage>" 1850 // 1851 unsigned level; 1852 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1853 if (nodeIdIndex + level >= maxIndex) { 1854 maxIndex = nodeIdIndex + level; 1855 } 1856 continue; 1857 } 1858 } 1859 1860 // 1861 // Check for empty file / no valid processor records, or too many. 1862 // The number of records can't exceed the number of valid bits in the 1863 // affinity mask. 1864 // 1865 if (num_records == 0) { 1866 *line = 0; 1867 *msg_id = kmp_i18n_str_NoProcRecords; 1868 return -1; 1869 } 1870 if (num_records > (unsigned)__kmp_xproc) { 1871 *line = 0; 1872 *msg_id = kmp_i18n_str_TooManyProcRecords; 1873 return -1; 1874 } 1875 1876 // 1877 // Set the file pointer back to the begginning, so that we can scan the 1878 // file again, this time performing a full parse of the data. 1879 // Allocate a vector of ProcCpuInfo object, where we will place the data. 1880 // Adding an extra element at the end allows us to remove a lot of extra 1881 // checks for termination conditions. 1882 // 1883 if (fseek(f, 0, SEEK_SET) != 0) { 1884 *line = 0; 1885 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1886 return -1; 1887 } 1888 1889 // 1890 // Allocate the array of records to store the proc info in. 
The dummy 1891 // element at the end makes the logic in filling them out easier to code. 1892 // 1893 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1894 * sizeof(unsigned *)); 1895 unsigned i; 1896 for (i = 0; i <= num_records; i++) { 1897 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1898 * sizeof(unsigned)); 1899 } 1900 1901 #define CLEANUP_THREAD_INFO \ 1902 for (i = 0; i <= num_records; i++) { \ 1903 __kmp_free(threadInfo[i]); \ 1904 } \ 1905 __kmp_free(threadInfo); 1906 1907 // 1908 // A value of UINT_MAX means that we didn't find the field 1909 // 1910 unsigned __index; 1911 1912 #define INIT_PROC_INFO(p) \ 1913 for (__index = 0; __index <= maxIndex; __index++) { \ 1914 (p)[__index] = UINT_MAX; \ 1915 } 1916 1917 for (i = 0; i <= num_records; i++) { 1918 INIT_PROC_INFO(threadInfo[i]); 1919 } 1920 1921 unsigned num_avail = 0; 1922 *line = 0; 1923 while (! feof(f)) { 1924 // 1925 // Create an inner scoping level, so that all the goto targets at the 1926 // end of the loop appear in an outer scoping level. This avoids 1927 // warnings about jumping past an initialization to a target in the 1928 // same block. 1929 // 1930 { 1931 buf[sizeof(buf) - 1] = 1; 1932 bool long_line = false; 1933 if (! fgets(buf, sizeof(buf), f)) { 1934 // 1935 // Read errors presumably because of EOF 1936 // 1937 // If there is valid data in threadInfo[num_avail], then fake 1938 // a blank line in ensure that the last address gets parsed. 1939 // 1940 bool valid = false; 1941 for (i = 0; i <= maxIndex; i++) { 1942 if (threadInfo[num_avail][i] != UINT_MAX) { 1943 valid = true; 1944 } 1945 } 1946 if (! valid) { 1947 break; 1948 } 1949 buf[0] = 0; 1950 } else if (!buf[sizeof(buf) - 1]) { 1951 // 1952 // The line is longer than the buffer. Set a flag and don't 1953 // emit an error if we were going to ignore the line, anyway. 
1954 // 1955 long_line = true; 1956 1957 #define CHECK_LINE \ 1958 if (long_line) { \ 1959 CLEANUP_THREAD_INFO; \ 1960 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1961 return -1; \ 1962 } 1963 } 1964 (*line)++; 1965 1966 char s1[] = "processor"; 1967 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1968 CHECK_LINE; 1969 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1970 unsigned val; 1971 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 1972 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1973 threadInfo[num_avail][osIdIndex] = val; 1974 #if KMP_OS_LINUX && USE_SYSFS_INFO 1975 char path[256]; 1976 KMP_SNPRINTF(path, sizeof(path), 1977 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1978 threadInfo[num_avail][osIdIndex]); 1979 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1980 1981 KMP_SNPRINTF(path, sizeof(path), 1982 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1983 threadInfo[num_avail][osIdIndex]); 1984 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1985 continue; 1986 #else 1987 } 1988 char s2[] = "physical id"; 1989 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1990 CHECK_LINE; 1991 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1992 unsigned val; 1993 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 1994 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 1995 threadInfo[num_avail][pkgIdIndex] = val; 1996 continue; 1997 } 1998 char s3[] = "core id"; 1999 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2000 CHECK_LINE; 2001 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2002 unsigned val; 2003 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2004 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2005 threadInfo[num_avail][coreIdIndex] = val; 2006 continue; 2007 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2008 } 2009 char s4[] = "thread id"; 2010 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2011 
CHECK_LINE; 2012 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2013 unsigned val; 2014 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2015 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2016 threadInfo[num_avail][threadIdIndex] = val; 2017 continue; 2018 } 2019 unsigned level; 2020 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2021 CHECK_LINE; 2022 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2023 unsigned val; 2024 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2025 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2026 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2027 threadInfo[num_avail][nodeIdIndex + level] = val; 2028 continue; 2029 } 2030 2031 // 2032 // We didn't recognize the leading token on the line. 2033 // There are lots of leading tokens that we don't recognize - 2034 // if the line isn't empty, go on to the next line. 2035 // 2036 if ((*buf != 0) && (*buf != '\n')) { 2037 // 2038 // If the line is longer than the buffer, read characters 2039 // until we find a newline. 2040 // 2041 if (long_line) { 2042 int ch; 2043 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2044 } 2045 continue; 2046 } 2047 2048 // 2049 // A newline has signalled the end of the processor record. 2050 // Check that there aren't too many procs specified. 2051 // 2052 if ((int)num_avail == __kmp_xproc) { 2053 CLEANUP_THREAD_INFO; 2054 *msg_id = kmp_i18n_str_TooManyEntries; 2055 return -1; 2056 } 2057 2058 // 2059 // Check for missing fields. The osId field must be there, and we 2060 // currently require that the physical id field is specified, also. 
2061 // 2062 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2063 CLEANUP_THREAD_INFO; 2064 *msg_id = kmp_i18n_str_MissingProcField; 2065 return -1; 2066 } 2067 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2068 CLEANUP_THREAD_INFO; 2069 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2070 return -1; 2071 } 2072 2073 // 2074 // Skip this proc if it is not included in the machine model. 2075 // 2076 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2077 INIT_PROC_INFO(threadInfo[num_avail]); 2078 continue; 2079 } 2080 2081 // 2082 // We have a successful parse of this proc's info. 2083 // Increment the counter, and prepare for the next proc. 2084 // 2085 num_avail++; 2086 KMP_ASSERT(num_avail <= num_records); 2087 INIT_PROC_INFO(threadInfo[num_avail]); 2088 } 2089 continue; 2090 2091 no_val: 2092 CLEANUP_THREAD_INFO; 2093 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2094 return -1; 2095 2096 dup_field: 2097 CLEANUP_THREAD_INFO; 2098 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2099 return -1; 2100 } 2101 *line = 0; 2102 2103 # if KMP_MIC && REDUCE_TEAM_SIZE 2104 unsigned teamSize = 0; 2105 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2106 2107 // check for num_records == __kmp_xproc ??? 2108 2109 // 2110 // If there's only one thread context to bind to, form an Address object 2111 // with depth 1 and return immediately (or, if affinity is off, set 2112 // address2os to NULL and return). 2113 // 2114 // If it is configured to omit the package level when there is only a 2115 // single package, the logic at the end of this routine won't work if 2116 // there is only a single thread - it would try to form an Address 2117 // object with depth 0. 2118 // 2119 KMP_ASSERT(num_avail > 0); 2120 KMP_ASSERT(num_avail <= num_records); 2121 if (num_avail == 1) { 2122 __kmp_ncores = 1; 2123 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2124 if (__kmp_affinity_verbose) { 2125 if (! 
KMP_AFFINITY_CAPABLE()) { 2126 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2127 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2128 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2129 } 2130 else { 2131 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2132 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2133 fullMask); 2134 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2135 if (__kmp_affinity_respect_mask) { 2136 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2137 } else { 2138 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2139 } 2140 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2141 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2142 } 2143 int index; 2144 kmp_str_buf_t buf; 2145 __kmp_str_buf_init(&buf); 2146 __kmp_str_buf_print(&buf, "1"); 2147 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2148 __kmp_str_buf_print(&buf, " x 1"); 2149 } 2150 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2151 __kmp_str_buf_free(&buf); 2152 } 2153 2154 if (__kmp_affinity_type == affinity_none) { 2155 CLEANUP_THREAD_INFO; 2156 return 0; 2157 } 2158 2159 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2160 Address addr(1); 2161 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2162 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2163 2164 if (__kmp_affinity_gran_levels < 0) { 2165 __kmp_affinity_gran_levels = 0; 2166 } 2167 2168 if (__kmp_affinity_verbose) { 2169 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2170 } 2171 2172 CLEANUP_THREAD_INFO; 2173 return 1; 2174 } 2175 2176 // 2177 // Sort the threadInfo table by physical Id. 2178 // 2179 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2180 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2181 2182 // 2183 // The table is now sorted by pkgId / coreId / threadId, but we really 2184 // don't know the radix of any of the fields. pkgId's may be sparsely 2185 // assigned among the chips on a system. 
Although coreId's are usually 2186 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2187 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2188 // 2189 // For that matter, we don't know what coresPerPkg and threadsPerCore 2190 // (or the total # packages) are at this point - we want to determine 2191 // that now. We only have an upper bound on the first two figures. 2192 // 2193 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2194 * sizeof(unsigned)); 2195 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2196 * sizeof(unsigned)); 2197 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2198 * sizeof(unsigned)); 2199 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2200 * sizeof(unsigned)); 2201 2202 bool assign_thread_ids = false; 2203 unsigned threadIdCt; 2204 unsigned index; 2205 2206 restart_radix_check: 2207 threadIdCt = 0; 2208 2209 // 2210 // Initialize the counter arrays with data from threadInfo[0]. 2211 // 2212 if (assign_thread_ids) { 2213 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2214 threadInfo[0][threadIdIndex] = threadIdCt++; 2215 } 2216 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2217 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2218 } 2219 } 2220 for (index = 0; index <= maxIndex; index++) { 2221 counts[index] = 1; 2222 maxCt[index] = 1; 2223 totals[index] = 1; 2224 lastId[index] = threadInfo[0][index];; 2225 } 2226 2227 // 2228 // Run through the rest of the OS procs. 2229 // 2230 for (i = 1; i < num_avail; i++) { 2231 // 2232 // Find the most significant index whose id differs 2233 // from the id for the previous OS proc. 2234 // 2235 for (index = maxIndex; index >= threadIdIndex; index--) { 2236 if (assign_thread_ids && (index == threadIdIndex)) { 2237 // 2238 // Auto-assign the thread id field if it wasn't specified. 
2239 // 2240 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2241 threadInfo[i][threadIdIndex] = threadIdCt++; 2242 } 2243 2244 // 2245 // Aparrently the thread id field was specified for some 2246 // entries and not others. Start the thread id counter 2247 // off at the next higher thread id. 2248 // 2249 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2250 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2251 } 2252 } 2253 if (threadInfo[i][index] != lastId[index]) { 2254 // 2255 // Run through all indices which are less significant, 2256 // and reset the counts to 1. 2257 // 2258 // At all levels up to and including index, we need to 2259 // increment the totals and record the last id. 2260 // 2261 unsigned index2; 2262 for (index2 = threadIdIndex; index2 < index; index2++) { 2263 totals[index2]++; 2264 if (counts[index2] > maxCt[index2]) { 2265 maxCt[index2] = counts[index2]; 2266 } 2267 counts[index2] = 1; 2268 lastId[index2] = threadInfo[i][index2]; 2269 } 2270 counts[index]++; 2271 totals[index]++; 2272 lastId[index] = threadInfo[i][index]; 2273 2274 if (assign_thread_ids && (index > threadIdIndex)) { 2275 2276 # if KMP_MIC && REDUCE_TEAM_SIZE 2277 // 2278 // The default team size is the total #threads in the machine 2279 // minus 1 thread for every core that has 3 or more threads. 2280 // 2281 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2282 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2283 2284 // 2285 // Restart the thread counter, as we are on a new core. 2286 // 2287 threadIdCt = 0; 2288 2289 // 2290 // Auto-assign the thread id field if it wasn't specified. 2291 // 2292 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2293 threadInfo[i][threadIdIndex] = threadIdCt++; 2294 } 2295 2296 // 2297 // Aparrently the thread id field was specified for some 2298 // entries and not others. Start the thread id counter 2299 // off at the next higher thread id. 
2300 // 2301 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2302 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2303 } 2304 } 2305 break; 2306 } 2307 } 2308 if (index < threadIdIndex) { 2309 // 2310 // If thread ids were specified, it is an error if they are not 2311 // unique. Also, check that we waven't already restarted the 2312 // loop (to be safe - shouldn't need to). 2313 // 2314 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2315 || assign_thread_ids) { 2316 __kmp_free(lastId); 2317 __kmp_free(totals); 2318 __kmp_free(maxCt); 2319 __kmp_free(counts); 2320 CLEANUP_THREAD_INFO; 2321 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2322 return -1; 2323 } 2324 2325 // 2326 // If the thread ids were not specified and we see entries 2327 // entries that are duplicates, start the loop over and 2328 // assign the thread ids manually. 2329 // 2330 assign_thread_ids = true; 2331 goto restart_radix_check; 2332 } 2333 } 2334 2335 # if KMP_MIC && REDUCE_TEAM_SIZE 2336 // 2337 // The default team size is the total #threads in the machine 2338 // minus 1 thread for every core that has 3 or more threads. 2339 // 2340 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2341 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2342 2343 for (index = threadIdIndex; index <= maxIndex; index++) { 2344 if (counts[index] > maxCt[index]) { 2345 maxCt[index] = counts[index]; 2346 } 2347 } 2348 2349 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2350 nCoresPerPkg = maxCt[coreIdIndex]; 2351 nPackages = totals[pkgIdIndex]; 2352 2353 // 2354 // Check to see if the machine topology is uniform 2355 // 2356 unsigned prod = totals[maxIndex]; 2357 for (index = threadIdIndex; index < maxIndex; index++) { 2358 prod *= maxCt[index]; 2359 } 2360 bool uniform = (prod == totals[threadIdIndex]); 2361 2362 // 2363 // When affinity is off, this routine will still be called to set 2364 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2365 // nCoresPerPkg, & nPackages. 
Make sure all these vars are set 2366 // correctly, and return now if affinity is not enabled. 2367 // 2368 __kmp_ncores = totals[coreIdIndex]; 2369 2370 if (__kmp_affinity_verbose) { 2371 if (! KMP_AFFINITY_CAPABLE()) { 2372 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2373 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2374 if (uniform) { 2375 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2376 } else { 2377 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2378 } 2379 } 2380 else { 2381 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2382 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2383 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2384 if (__kmp_affinity_respect_mask) { 2385 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2386 } else { 2387 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2388 } 2389 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2390 if (uniform) { 2391 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2392 } else { 2393 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2394 } 2395 } 2396 kmp_str_buf_t buf; 2397 __kmp_str_buf_init(&buf); 2398 2399 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2400 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2401 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2402 } 2403 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2404 maxCt[threadIdIndex], __kmp_ncores); 2405 2406 __kmp_str_buf_free(&buf); 2407 } 2408 2409 # if KMP_MIC && REDUCE_TEAM_SIZE 2410 // 2411 // Set the default team size. 
2412 // 2413 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2414 __kmp_dflt_team_nth = teamSize; 2415 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2416 __kmp_dflt_team_nth)); 2417 } 2418 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2419 2420 if (__kmp_affinity_type == affinity_none) { 2421 __kmp_free(lastId); 2422 __kmp_free(totals); 2423 __kmp_free(maxCt); 2424 __kmp_free(counts); 2425 CLEANUP_THREAD_INFO; 2426 return 0; 2427 } 2428 2429 // 2430 // Count the number of levels which have more nodes at that level than 2431 // at the parent's level (with there being an implicit root node of 2432 // the top level). This is equivalent to saying that there is at least 2433 // one node at this level which has a sibling. These levels are in the 2434 // map, and the package level is always in the map. 2435 // 2436 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2437 int level = 0; 2438 for (index = threadIdIndex; index < maxIndex; index++) { 2439 KMP_ASSERT(totals[index] >= totals[index + 1]); 2440 inMap[index] = (totals[index] > totals[index + 1]); 2441 } 2442 inMap[maxIndex] = (totals[maxIndex] > 1); 2443 inMap[pkgIdIndex] = true; 2444 2445 int depth = 0; 2446 for (index = threadIdIndex; index <= maxIndex; index++) { 2447 if (inMap[index]) { 2448 depth++; 2449 } 2450 } 2451 KMP_ASSERT(depth > 0); 2452 2453 // 2454 // Construct the data structure that is to be returned. 2455 // 2456 *address2os = (AddrUnsPair*) 2457 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2458 int pkgLevel = -1; 2459 int coreLevel = -1; 2460 int threadLevel = -1; 2461 2462 for (i = 0; i < num_avail; ++i) { 2463 Address addr(depth); 2464 unsigned os = threadInfo[i][osIdIndex]; 2465 int src_index; 2466 int dst_index = 0; 2467 2468 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2469 if (! 
inMap[src_index]) { 2470 continue; 2471 } 2472 addr.labels[dst_index] = threadInfo[i][src_index]; 2473 if (src_index == pkgIdIndex) { 2474 pkgLevel = dst_index; 2475 } 2476 else if (src_index == coreIdIndex) { 2477 coreLevel = dst_index; 2478 } 2479 else if (src_index == threadIdIndex) { 2480 threadLevel = dst_index; 2481 } 2482 dst_index++; 2483 } 2484 (*address2os)[i] = AddrUnsPair(addr, os); 2485 } 2486 2487 if (__kmp_affinity_gran_levels < 0) { 2488 // 2489 // Set the granularity level based on what levels are modeled 2490 // in the machine topology map. 2491 // 2492 unsigned src_index; 2493 __kmp_affinity_gran_levels = 0; 2494 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2495 if (! inMap[src_index]) { 2496 continue; 2497 } 2498 switch (src_index) { 2499 case threadIdIndex: 2500 if (__kmp_affinity_gran > affinity_gran_thread) { 2501 __kmp_affinity_gran_levels++; 2502 } 2503 2504 break; 2505 case coreIdIndex: 2506 if (__kmp_affinity_gran > affinity_gran_core) { 2507 __kmp_affinity_gran_levels++; 2508 } 2509 break; 2510 2511 case pkgIdIndex: 2512 if (__kmp_affinity_gran > affinity_gran_package) { 2513 __kmp_affinity_gran_levels++; 2514 } 2515 break; 2516 } 2517 } 2518 } 2519 2520 if (__kmp_affinity_verbose) { 2521 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2522 coreLevel, threadLevel); 2523 } 2524 2525 __kmp_free(inMap); 2526 __kmp_free(lastId); 2527 __kmp_free(totals); 2528 __kmp_free(maxCt); 2529 __kmp_free(counts); 2530 CLEANUP_THREAD_INFO; 2531 return depth; 2532 } 2533 2534 2535 // 2536 // Create and return a table of affinity masks, indexed by OS thread ID. 2537 // This routine handles OR'ing together all the affinity masks of threads 2538 // that are sufficiently close, if granularity > fine. 
//
// On return, *maxIndex is the largest OS thread id seen and *numUnique is
// the number of distinct granularity-level groups formed.  The caller owns
// the returned table, which is allocated with __kmp_allocate().
//
static kmp_affin_mask_t *
__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
  AddrUnsPair *address2os, unsigned numAddrs)
{
    //
    // First form a table of affinity masks in order of OS thread id.
    //
    unsigned depth;
    unsigned maxOsId;
    unsigned i;

    KMP_ASSERT(numAddrs > 0);
    depth = address2os[0].first.depth;

    //
    // The table is indexed by OS id, so size it for the largest id present.
    //
    maxOsId = 0;
    for (i = 0; i < numAddrs; i++) {
        unsigned osId = address2os[i].second;
        if (osId > maxOsId) {
            maxOsId = osId;
        }
    }
    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
      (maxOsId + 1) * __kmp_affin_mask_size);

    //
    // Sort the address2os table according to physical order.  Doing so
    // will put all threads on the same core/package/node in consecutive
    // locations.
    //
    qsort(address2os, numAddrs, sizeof(*address2os),
      __kmp_affinity_cmp_Address_labels);

    KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
    if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
        KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
    }
    if (__kmp_affinity_gran_levels >= (int)depth) {
        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffThreadsMayMigrate);
        }
    }

    //
    // Run through the table, forming the masks for all threads on each
    // core.  Threads on the same core will have identical "Address"
    // objects, not considering the last level, which must be the thread
    // id.  All threads on a core will appear consecutively.
    //
    unsigned unique = 0;
    unsigned j = 0;                         // index of 1st thread on core
    unsigned leader = 0;
    Address *leaderAddr = &(address2os[0].first);
    // "sum" accumulates the union of the masks of all threads in the
    // current group; it is stack-allocated, so it needs no cleanup.
    kmp_affin_mask_t *sum
      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(address2os[0].second, sum);
    for (i = 1; i < numAddrs; i++) {
        //
        // If this thread is sufficiently close to the leader (within the
        // granularity setting), then set the bit for this os thread in the
        // affinity mask for this group, and go on to the next thread.
        //
        if (leaderAddr->isClose(address2os[i].first,
          __kmp_affinity_gran_levels)) {
            KMP_CPU_SET(address2os[i].second, sum);
            continue;
        }

        //
        // For every thread in this group, copy the mask to the thread's
        // entry in the osId2Mask table.  Mark the first address as a
        // leader.
        //
        for (; j < i; j++) {
            unsigned osId = address2os[j].second;
            KMP_DEBUG_ASSERT(osId <= maxOsId);
            kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
            KMP_CPU_COPY(mask, sum);
            address2os[j].first.leader = (j == leader);
        }
        unique++;

        //
        // Start a new mask.
        //
        leader = i;
        leaderAddr = &(address2os[i].first);
        KMP_CPU_ZERO(sum);
        KMP_CPU_SET(address2os[i].second, sum);
    }

    //
    // For every thread in last group, copy the mask to the thread's
    // entry in the osId2Mask table.
    //
    for (; j < i; j++) {
        unsigned osId = address2os[j].second;
        KMP_DEBUG_ASSERT(osId <= maxOsId);
        kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
        KMP_CPU_COPY(mask, sum);
        address2os[j].first.leader = (j == leader);
    }
    unique++;

    *maxIndex = maxOsId;
    *numUnique = unique;
    return osId2Mask;
}


//
// Stuff for the affinity proclist parsers.
It's easier to declare these vars
// as file-static than to try and pass them through the calling sequence of
// the recursive-descent OMP_PLACES parser.

static kmp_affin_mask_t *newMasks;      // growable vector of accumulated masks
static int numNewMasks;                 // current capacity of newMasks
static int nextNewMask;                 // number of masks stored so far

//
// Append a copy of _mask to the newMasks vector, doubling its capacity
// when full.
//
#define ADD_MASK(_mask) \
    { \
        if (nextNewMask >= numNewMasks) { \
            numNewMasks *= 2; \
            newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \
              numNewMasks * __kmp_affin_mask_size); \
        } \
        KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
        nextNewMask++; \
    }

//
// Append the mask for OS proc _osId, or warn and skip it when the id is out
// of range or not part of the machine model.
//
#define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \
    { \
        if (((_osId) > _maxOsId) || \
          (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings \
              && (__kmp_affinity_type != affinity_none))) { \
                KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
            } \
        } \
        else { \
            ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
        } \
    }


//
// Re-parse the proclist (for the explicit affinity type), and form the list
// of affinity newMasks indexed by gtid.
//
static void
__kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
  unsigned int *out_numMasks, const char *proclist,
  kmp_affin_mask_t *osId2Mask, int maxOsId)
{
    const char *scan = proclist;
    const char *next = proclist;

    //
    // We use malloc() for the temporary mask vector,
    // so that we can use realloc() to extend it.
//
    numNewMasks = 2;
    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
      * __kmp_affin_mask_size);
    nextNewMask = 0;
    // sumMask accumulates the union of the masks inside one "{...}" set;
    // setSize counts how many valid procs contributed to it.
    kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate(
      __kmp_affin_mask_size);
    int setSize = 0;

    // One iteration per comma-separated item: a "{...}" set, a single proc
    // id, or a range with optional stride.
    for (;;) {
        int start, end, stride;

        SKIP_WS(scan);
        next = scan;
        if (*next == '\0') {
            break;
        }

        if (*next == '{') {
            int num;
            setSize = 0;
            next++;         // skip '{'
            SKIP_WS(next);
            scan = next;

            //
            // Read the first integer in the set.
            //
            KMP_ASSERT2((*next >= '0') && (*next <= '9'),
              "bad proclist");
            SKIP_DIGITS(next);
            num = __kmp_str_to_int(scan, *next);
            KMP_ASSERT2(num >= 0, "bad explicit proc list");

            //
            // Copy the mask for that osId to the sum (union) mask.
            //
            if ((num > maxOsId) ||
              (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, num);
                }
                KMP_CPU_ZERO(sumMask);
            }
            else {
                KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
                setSize = 1;
            }

            for (;;) {
                //
                // Check for end of set.
                //
                SKIP_WS(next);
                if (*next == '}') {
                    next++;         // skip '}'
                    break;
                }

                //
                // Skip optional comma.
                //
                if (*next == ',') {
                    next++;
                }
                SKIP_WS(next);

                //
                // Read the next integer in the set.
                //
                scan = next;
                KMP_ASSERT2((*next >= '0') && (*next <= '9'),
                  "bad explicit proc list");

                SKIP_DIGITS(next);
                num = __kmp_str_to_int(scan, *next);
                KMP_ASSERT2(num >= 0, "bad explicit proc list");

                //
                // Add the mask for that osId to the sum mask.
                //
                if ((num > maxOsId) ||
                  (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
                    if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                      && (__kmp_affinity_type != affinity_none))) {
                        KMP_WARNING(AffIgnoreInvalidProcID, num);
                    }
                }
                else {
                    KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
                    setSize++;
                }
            }
            if (setSize > 0) {
                ADD_MASK(sumMask);
            }

            SKIP_WS(next);
            if (*next == ',') {
                next++;
            }
            scan = next;
            continue;
        }

        //
        // Read the first integer.
        //
        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(scan, *next);
        KMP_ASSERT2(start >= 0, "bad explicit proc list");
        SKIP_WS(next);

        //
        // If this isn't a range, then add a mask to the list and go on.
        //
        if (*next != '-') {
            ADD_MASK_OSID(start, osId2Mask, maxOsId);

            //
            // Skip optional comma.
            //
            if (*next == ',') {
                next++;
            }
            scan = next;
            continue;
        }

        //
        // This is a range.  Skip over the '-' and read in the 2nd int.
        //
        next++;         // skip '-'
        SKIP_WS(next);
        scan = next;
        KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
        SKIP_DIGITS(next);
        end = __kmp_str_to_int(scan, *next);
        KMP_ASSERT2(end >= 0, "bad explicit proc list");

        //
        // Check for a stride parameter
        //
        stride = 1;
        SKIP_WS(next);
        if (*next == ':') {
            //
            // A stride is specified.  Skip over the ':' and read the 3rd int.
2851 // 2852 int sign = +1; 2853 next++; // skip ':' 2854 SKIP_WS(next); 2855 scan = next; 2856 if (*next == '-') { 2857 sign = -1; 2858 next++; 2859 SKIP_WS(next); 2860 scan = next; 2861 } 2862 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2863 "bad explicit proc list"); 2864 SKIP_DIGITS(next); 2865 stride = __kmp_str_to_int(scan, *next); 2866 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2867 stride *= sign; 2868 } 2869 2870 // 2871 // Do some range checks. 2872 // 2873 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2874 if (stride > 0) { 2875 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2876 } 2877 else { 2878 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2879 } 2880 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2881 2882 // 2883 // Add the mask for each OS proc # to the list. 2884 // 2885 if (stride > 0) { 2886 do { 2887 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2888 start += stride; 2889 } while (start <= end); 2890 } 2891 else { 2892 do { 2893 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2894 start += stride; 2895 } while (start >= end); 2896 } 2897 2898 // 2899 // Skip optional comma. 2900 // 2901 SKIP_WS(next); 2902 if (*next == ',') { 2903 next++; 2904 } 2905 scan = next; 2906 } 2907 2908 *out_numMasks = nextNewMask; 2909 if (nextNewMask == 0) { 2910 *out_masks = NULL; 2911 KMP_INTERNAL_FREE(newMasks); 2912 return; 2913 } 2914 *out_masks 2915 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2916 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2917 __kmp_free(sumMask); 2918 KMP_INTERNAL_FREE(newMasks); 2919 } 2920 2921 2922 # if OMP_40_ENABLED 2923 2924 /*----------------------------------------------------------------------------- 2925 2926 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2927 places. 
Again, Here is the grammar: 2928 2929 place_list := place 2930 place_list := place , place_list 2931 place := num 2932 place := place : num 2933 place := place : num : signed 2934 place := { subplacelist } 2935 place := ! place // (lowest priority) 2936 subplace_list := subplace 2937 subplace_list := subplace , subplace_list 2938 subplace := num 2939 subplace := num : num 2940 subplace := num : num : signed 2941 signed := num 2942 signed := + signed 2943 signed := - signed 2944 2945 -----------------------------------------------------------------------------*/ 2946 2947 static void 2948 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2949 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2950 { 2951 const char *next; 2952 2953 for (;;) { 2954 int start, count, stride, i; 2955 2956 // 2957 // Read in the starting proc id 2958 // 2959 SKIP_WS(*scan); 2960 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2961 "bad explicit places list"); 2962 next = *scan; 2963 SKIP_DIGITS(next); 2964 start = __kmp_str_to_int(*scan, *next); 2965 KMP_ASSERT(start >= 0); 2966 *scan = next; 2967 2968 // 2969 // valid follow sets are ',' ':' and '}' 2970 // 2971 SKIP_WS(*scan); 2972 if (**scan == '}' || **scan == ',') { 2973 if ((start > maxOsId) || 2974 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2975 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2976 && (__kmp_affinity_type != affinity_none))) { 2977 KMP_WARNING(AffIgnoreInvalidProcID, start); 2978 } 2979 } 2980 else { 2981 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2982 (*setSize)++; 2983 } 2984 if (**scan == '}') { 2985 break; 2986 } 2987 (*scan)++; // skip ',' 2988 continue; 2989 } 2990 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2991 (*scan)++; // skip ':' 2992 2993 // 2994 // Read count parameter 2995 // 2996 SKIP_WS(*scan); 2997 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2998 "bad explicit places list"); 2999 next = *scan; 3000 SKIP_DIGITS(next); 3001 count = __kmp_str_to_int(*scan, *next); 3002 KMP_ASSERT(count >= 0); 3003 *scan = next; 3004 3005 // 3006 // valid follow sets are ',' ':' and '}' 3007 // 3008 SKIP_WS(*scan); 3009 if (**scan == '}' || **scan == ',') { 3010 for (i = 0; i < count; i++) { 3011 if ((start > maxOsId) || 3012 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3013 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3014 && (__kmp_affinity_type != affinity_none))) { 3015 KMP_WARNING(AffIgnoreInvalidProcID, start); 3016 } 3017 break; // don't proliferate warnings for large count 3018 } 3019 else { 3020 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3021 start++; 3022 (*setSize)++; 3023 } 3024 } 3025 if (**scan == '}') { 3026 break; 3027 } 3028 (*scan)++; // skip ',' 3029 continue; 3030 } 3031 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3032 (*scan)++; // skip ':' 3033 3034 // 3035 // Read stride parameter 3036 // 3037 int sign = +1; 3038 for (;;) { 3039 SKIP_WS(*scan); 3040 if (**scan == '+') { 3041 (*scan)++; // skip '+' 3042 continue; 3043 } 3044 if (**scan == '-') { 3045 sign *= -1; 3046 (*scan)++; // skip '-' 3047 continue; 3048 } 3049 break; 3050 } 3051 SKIP_WS(*scan); 3052 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3053 "bad explicit places list"); 3054 next = *scan; 3055 SKIP_DIGITS(next); 3056 stride = __kmp_str_to_int(*scan, *next); 3057 KMP_ASSERT(stride >= 0); 3058 *scan = next; 3059 stride *= sign; 3060 3061 // 3062 // valid follow sets are ',' and '}' 3063 // 3064 SKIP_WS(*scan); 3065 if (**scan == '}' || **scan == ',') { 3066 for (i = 0; i < count; i++) { 3067 if ((start > maxOsId) || 3068 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3069 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3070 && (__kmp_affinity_type != affinity_none))) { 3071 KMP_WARNING(AffIgnoreInvalidProcID, start); 3072 } 3073 break; // don't proliferate warnings for large count 3074 } 3075 else { 3076 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3077 start += stride; 3078 (*setSize)++; 3079 } 3080 } 3081 if (**scan == '}') { 3082 break; 3083 } 3084 (*scan)++; // skip ',' 3085 continue; 3086 } 3087 3088 KMP_ASSERT2(0, "bad explicit places list"); 3089 } 3090 } 3091 3092 3093 static void 3094 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3095 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3096 { 3097 const char *next; 3098 3099 // 3100 // valid follow sets are '{' '!' and num 3101 // 3102 SKIP_WS(*scan); 3103 if (**scan == '{') { 3104 (*scan)++; // skip '{' 3105 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3106 setSize); 3107 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3108 (*scan)++; // skip '}' 3109 } 3110 else if (**scan == '!') { 3111 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3112 KMP_CPU_COMPLEMENT(tempMask); 3113 (*scan)++; // skip '!' 3114 } 3115 else if ((**scan >= '0') && (**scan <= '9')) { 3116 next = *scan; 3117 SKIP_DIGITS(next); 3118 int num = __kmp_str_to_int(*scan, *next); 3119 KMP_ASSERT(num >= 0); 3120 if ((num > maxOsId) || 3121 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffIgnoreInvalidProcID, num);
            }
        }
        else {
            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
            (*setSize)++;
        }
        *scan = next;   // skip num
    }
    else {
        KMP_ASSERT2(0, "bad explicit places list");
    }
}


//
// Re-parse the OMP_PLACES place list, forming the list of affinity newMasks
// (one mask per place).  A place may be followed by ":count[:stride]",
// which replicates it count times, shifting by stride procs each step.
//
//static void
void
__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
  unsigned int *out_numMasks, const char *placelist,
  kmp_affin_mask_t *osId2Mask, int maxOsId)
{
    const char *scan = placelist;
    const char *next = placelist;

    numNewMasks = 2;
    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
      * __kmp_affin_mask_size);
    nextNewMask = 0;

    // tempMask accumulates the current place; setSize counts the valid
    // procs it contains.
    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
      __kmp_affin_mask_size);
    KMP_CPU_ZERO(tempMask);
    int setSize = 0;

    for (;;) {
        __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0' || *scan == ',') {
            if (setSize > 0) {
                ADD_MASK(tempMask);
            }
            KMP_CPU_ZERO(tempMask);
            setSize = 0;
            if (*scan == '\0') {
                break;
            }
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(*scan == ':', "bad explicit places list");
        scan++;         // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(scan);
        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
          "bad explicit places list");
        next = scan;
        SKIP_DIGITS(next);
        int count = __kmp_str_to_int(scan, *next);
        KMP_ASSERT(count >= 0);
        scan = next;

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        int stride;
        if (*scan == '\0' || *scan == ',') {
            stride = +1;
        }
        else {
            KMP_ASSERT2(*scan == ':', "bad explicit places list");
            scan++;     // skip ':'

            //
            // Read stride parameter
            //
            int sign = +1;
            for (;;) {
                SKIP_WS(scan);
                if (*scan == '+') {
                    scan++; // skip '+'
                    continue;
                }
                if (*scan == '-') {
                    sign *= -1;
                    scan++; // skip '-'
                    continue;
                }
                break;
            }
            SKIP_WS(scan);
            KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
              "bad explicit places list");
            next = scan;
            SKIP_DIGITS(next);
            stride = __kmp_str_to_int(scan, *next);
            KMP_DEBUG_ASSERT(stride >= 0);
            scan = next;
            stride *= sign;
        }

        // Replicate the place count times: each iteration emits the current
        // tempMask, then shifts it by |stride| bit positions (dropping any
        // bit that falls outside the mask or the machine model).
        if (stride > 0) {
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                for (; j >= 0; j--) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        else {
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
                  j++) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (!
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3280 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3281 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3282 KMP_WARNING(AffIgnoreInvalidProcID, j); 3283 } 3284 KMP_CPU_CLR(j, tempMask); 3285 } 3286 else { 3287 KMP_CPU_SET(j, tempMask); 3288 setSize++; 3289 } 3290 } 3291 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3292 KMP_CPU_CLR(j, tempMask); 3293 } 3294 } 3295 } 3296 KMP_CPU_ZERO(tempMask); 3297 setSize = 0; 3298 3299 // 3300 // valid follow sets are ',' and EOL 3301 // 3302 SKIP_WS(scan); 3303 if (*scan == '\0') { 3304 break; 3305 } 3306 if (*scan == ',') { 3307 scan++; // skip ',' 3308 continue; 3309 } 3310 3311 KMP_ASSERT2(0, "bad explicit places list"); 3312 } 3313 3314 *out_numMasks = nextNewMask; 3315 if (nextNewMask == 0) { 3316 *out_masks = NULL; 3317 KMP_INTERNAL_FREE(newMasks); 3318 return; 3319 } 3320 *out_masks 3321 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3322 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3323 __kmp_free(tempMask); 3324 KMP_INTERNAL_FREE(newMasks); 3325 } 3326 3327 # endif /* OMP_40_ENABLED */ 3328 3329 #undef ADD_MASK 3330 #undef ADD_MASK_OSID 3331 3332 static void 3333 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3334 { 3335 if ( __kmp_place_num_cores == 0 ) { 3336 if ( __kmp_place_num_threads_per_core == 0 ) { 3337 return; // no cores limiting actions requested, exit 3338 } 3339 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3340 } 3341 if ( !__kmp_affinity_uniform_topology() ) { 3342 KMP_WARNING( AffThrPlaceNonUniform ); 3343 return; // don't support non-uniform topology 3344 } 3345 if ( depth != 3 ) { 3346 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3347 return; // don't support not-3-level topology 3348 } 3349 if ( __kmp_place_num_threads_per_core == 0 ) { 3350 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3351 } 3352 
if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3353 KMP_WARNING( AffThrPlaceManyCores ); 3354 return; 3355 } 3356 3357 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3358 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3359 int i, j, k, n_old = 0, n_new = 0; 3360 for ( i = 0; i < nPackages; ++i ) { 3361 for ( j = 0; j < nCoresPerPkg; ++j ) { 3362 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3363 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3364 } else { 3365 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3366 if ( k < __kmp_place_num_threads_per_core ) { 3367 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location 3368 n_new++; 3369 } 3370 n_old++; 3371 } 3372 } 3373 } 3374 } 3375 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3376 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3377 __kmp_avail_proc = n_new; // correct avail_proc 3378 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3379 3380 __kmp_free( *pAddr ); 3381 *pAddr = newAddr; // replace old topology with new one 3382 } 3383 3384 3385 static AddrUnsPair *address2os = NULL; 3386 static int * procarr = NULL; 3387 static int __kmp_aff_depth = 0; 3388 3389 static void 3390 __kmp_aux_affinity_initialize(void) 3391 { 3392 if (__kmp_affinity_masks != NULL) { 3393 KMP_ASSERT(fullMask != NULL); 3394 return; 3395 } 3396 3397 // 3398 // Create the "full" mask - this defines all of the processors that we 3399 // consider to be in the machine model. If respect is set, then it is 3400 // the initialization thread's affinity mask. Otherwise, it is all 3401 // processors that we know about on the machine. 
    //
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            if (__kmp_avail_proc > __kmp_xproc) {
                // Inconsistent OS information; give up on affinity.
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                KMP_AFFINITY_DISABLE();
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method. We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        // First choice on x86: decode the x2APIC ids via CPUID leaf 11.
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            // depth 0 means "affinity deliberately off" - nothing to build.
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        if (depth < 0) {
            // x2APIC decoding failed; fall back to the legacy APIC method.
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        // Next fallback on Linux: parse /proc/cpuinfo.
        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_GROUP_AFFINITY

        // On Windows with multiple processor groups, use the group structure.
        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
                KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        // Last resort: a flat map where OS proc id == thread id.
        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails. The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    // KMP_TOPOLOGY_METHOD=x2apicid: failure is fatal.
    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    // KMP_TOPOLOGY_METHOD=apicid: failure is fatal.
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    // KMP_TOPOLOGY_METHOD=cpuinfo (or KMP_CPUINFO_FILE set): parse the
    // cpuinfo-format file; any parse failure is fatal.
    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            // Fatal either way; the hint differs if the name came from
            // KMP_CPUINFO_FILE.  (__kmp_msg with kmp_ms_fatal does not return.)
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_GROUP_AFFINITY

    // KMP_TOPOLOGY_METHOD=group (Windows processor groups): failure is fatal.
    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_GROUP_AFFINITY */

    // KMP_TOPOLOGY_METHOD=flat: trivial map, cannot fail.
    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    // No method produced a topology: disable affinity.
    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
    }

    // Apply any KMP_PLACE_THREADS restriction to the discovered topology.
    __kmp_apply_thread_places(&address2os, depth);

    //
    // Create the table of masks, indexed by thread Id.
    //
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        // At the finest granularity every proc gets a unique mask.
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects. This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            // KMP_AFFINITY=proclist syntax.
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            // OMP_PLACES syntax.
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree. Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        // scatter is the mirror image of compact.
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

        case affinity_balanced:
        // Balanced works only for the case of a single package and uniform topology
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            // Index of the core level in the address labels: one above the
            // thread level when HT is present, else the deepest level.
            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            // procarr[core * nth_per_core + thread] = OS proc id, or -1 for
            // slots with no processor (holes in the non-uniform topology).
            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        // OMP_PLACES=...(n) can cap the number of places.
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                // Unless duplicates are allowed, only the "leader" entry of
                // each granularity group contributes a mask.
                if ((! __kmp_affinity_dups) && (!
                  address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}


//
// Public entry point for affinity initialization.  Wraps the real worker,
// temporarily mapping affinity_disabled -> affinity_none around the call.
//
void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code.  Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


//
// Release all memory owned by the affinity subsystem and reset its state,
// so that initialization can run again (e.g. after fork).
//
void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        __kmp_free(__kmp_affinity_masks);
        __kmp_affinity_masks = NULL;
    }
    if (fullMask != NULL) {
        KMP_CPU_FREE(fullMask);
        fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if( address2os != NULL ) {
        __kmp_free( address2os );
        address2os = NULL;
    }
    if( procarr != NULL ) {
        __kmp_free( procarr );
        procarr = NULL;
    }
}


//
// Bind a newly created thread (global tid `gtid`) to its initial place /
// affinity mask, and record the place in its kmp_info_t.
//
void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;    // place index the thread is bound to, or KMP_PLACE_ALL

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        // KMP_AFFINITY-style (Intel) binding.
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
          ) {
# if KMP_GROUP_AFFINITY
            // With multiple Windows processor groups there is no single
            // full mask to bind to; leave the thread unbound.
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        // OMP 4.0 places-style binding.
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        // Root threads own the whole place partition.
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

//
// Move thread `gtid` to its pending place (th_new_place), binding its OS
// affinity to that place's mask.
//
void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (!
      KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        // Partition wraps around the end of the place list.
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


//
// Implementation of kmp_set_affinity(): validate the user-supplied mask
// (when consistency checks are on) and install it as the calling thread's
// affinity.  Returns the system call's status, or -1 if not capable.
//
int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            // The mask must be non-empty and a subset of the full mask.
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            // On Windows the mask must fit within a single processor group.
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        // Only record the mask if the OS accepted it.
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


//
// Implementation of kmp_get_affinity(): copy the calling thread's affinity
// mask into *mask.  On non-Windows systems the mask is queried from the OS;
// on Windows the stored mask is returned.
//
int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

//
// Implementation of kmp_set_affinity_mask_proc(): set bit `proc` in the
// user's mask.  Returns 0 on success, -1 if proc is out of range, -2 if
// proc is not in the machine's full mask.
//
int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (!
KMP_AFFINITY_CAPABLE()) { 4273 return -1; 4274 } 4275 4276 KA_TRACE(1000, ;{ 4277 int gtid = __kmp_entry_gtid(); 4278 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4279 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4280 (kmp_affin_mask_t *)(*mask)); 4281 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4282 proc, gtid, buf); 4283 }); 4284 4285 if (__kmp_env_consistency_check) { 4286 if ((mask == NULL) || (*mask == NULL)) { 4287 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4288 } 4289 } 4290 4291 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4292 return -1; 4293 } 4294 if (! KMP_CPU_ISSET(proc, fullMask)) { 4295 return -2; 4296 } 4297 4298 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4299 return 0; 4300 } 4301 4302 4303 int 4304 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4305 { 4306 int retval; 4307 4308 if (! KMP_AFFINITY_CAPABLE()) { 4309 return -1; 4310 } 4311 4312 KA_TRACE(1000, ;{ 4313 int gtid = __kmp_entry_gtid(); 4314 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4315 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4316 (kmp_affin_mask_t *)(*mask)); 4317 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4318 proc, gtid, buf); 4319 }); 4320 4321 if (__kmp_env_consistency_check) { 4322 if ((mask == NULL) || (*mask == NULL)) { 4323 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4324 } 4325 } 4326 4327 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4328 return -1; 4329 } 4330 if (! KMP_CPU_ISSET(proc, fullMask)) { 4331 return -2; 4332 } 4333 4334 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4335 return 0; 4336 } 4337 4338 4339 int 4340 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4341 { 4342 int retval; 4343 4344 if (! 
KMP_AFFINITY_CAPABLE()) { 4345 return -1; 4346 } 4347 4348 KA_TRACE(1000, ;{ 4349 int gtid = __kmp_entry_gtid(); 4350 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4351 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4352 (kmp_affin_mask_t *)(*mask)); 4353 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n", 4354 proc, gtid, buf); 4355 }); 4356 4357 if (__kmp_env_consistency_check) { 4358 if ((mask == NULL) || (*mask == NULL)) { 4359 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4360 } 4361 } 4362 4363 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4364 return 0; 4365 } 4366 if (! KMP_CPU_ISSET(proc, fullMask)) { 4367 return 0; 4368 } 4369 4370 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4371 } 4372 4373 4374 // Dynamic affinity settings - Affinity balanced 4375 void __kmp_balanced_affinity( int tid, int nthreads ) 4376 { 4377 if( __kmp_affinity_uniform_topology() ) { 4378 int coreID; 4379 int threadID; 4380 // Number of hyper threads per core in HT machine 4381 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4382 // Number of cores 4383 int ncores = __kmp_ncores; 4384 // How many threads will be bound to each core 4385 int chunk = nthreads / ncores; 4386 // How many cores will have an additional thread bound to it - "big cores" 4387 int big_cores = nthreads % ncores; 4388 // Number of threads on the big cores 4389 int big_nth = ( chunk + 1 ) * big_cores; 4390 if( tid < big_nth ) { 4391 coreID = tid / (chunk + 1 ); 4392 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ; 4393 } else { //tid >= big_nth 4394 coreID = ( tid - big_cores ) / chunk; 4395 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ; 4396 } 4397 4398 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4399 "Illegal set affinity operation when not capable"); 4400 4401 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 4402 KMP_CPU_ZERO(mask); 4403 4404 // 
Granularity == thread 4405 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4406 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; 4407 KMP_CPU_SET( osID, mask); 4408 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4409 for( int i = 0; i < __kmp_nth_per_core; i++ ) { 4410 int osID; 4411 osID = address2os[ coreID * __kmp_nth_per_core + i ].second; 4412 KMP_CPU_SET( osID, mask); 4413 } 4414 } 4415 if (__kmp_affinity_verbose) { 4416 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4417 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4418 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4419 tid, buf); 4420 } 4421 __kmp_set_system_affinity( mask, TRUE ); 4422 } else { // Non-uniform topology 4423 4424 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 4425 KMP_CPU_ZERO(mask); 4426 4427 // Number of hyper threads per core in HT machine 4428 int nth_per_core = __kmp_nThreadsPerCore; 4429 int core_level; 4430 if( nth_per_core > 1 ) { 4431 core_level = __kmp_aff_depth - 2; 4432 } else { 4433 core_level = __kmp_aff_depth - 1; 4434 } 4435 4436 // Number of cores - maximum value; it does not count trail cores with 0 processors 4437 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 4438 4439 // For performance gain consider the special case nthreads == __kmp_avail_proc 4440 if( nthreads == __kmp_avail_proc ) { 4441 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4442 int osID = address2os[ tid ].second; 4443 KMP_CPU_SET( osID, mask); 4444 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4445 int coreID = address2os[ tid ].first.labels[ core_level ]; 4446 // We'll count found osIDs for the current core; they can be not more than nth_per_core; 4447 // since the address2os is sortied we can break when cnt==nth_per_core 4448 
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {
            // At most one thread per core: give thread 'tid' the tid-th
            // core, counting only cores that have available processors.

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // NOTE(review): the runtime-sized stack arrays (VLAs) below are
            // a compiler extension in C++; sizes are bounded by the machine
            // topology, so stack usage should be modest -- confirm.
            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors;
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            // Count the available processors on each core.
            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            // Suffix sums: ncores_with_x_to_max_procs[i] = number of cores
            // with at least i available processors.
            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            // Distribute nthreads over the contexts: on the first sweep
            // (flag == 0) each free context receives at most one thread;
            // on later sweeps (flag != 0) additional threads are stacked
            // onto contexts that are already occupied.
            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }

            // Thread 'tid' takes the context where the running total of
            // placed threads first exceeds tid.
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        // Commit the computed mask for the calling thread.
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#else
    // affinity not supported

//
// With no affinity information available, the barrier state (kmp_bstate_t)
// still needs a machine hierarchy; derive a balanced one from the processor
// count alone.  The result is computed on the first call and cached in
// these file-scope globals.
//
kmp_uint32 mac_skipPerLevel[7];
kmp_uint32 mac_depth;
kmp_uint8 mac_leaf_kids;

// Fill in thr_bar's hierarchy info (depth, leaf fan-out, per-level strides)
// for a machine with nproc processors, using a branching factor of ~4.
// NOTE(review): the 'first' flag is read and written without
// synchronization -- presumably the first call happens before this path is
// exercised concurrently; confirm.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            // Halve an over-wide level, pushing entries up one level and
            // growing mac_depth as needed; odd counts are rounded up first.
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        // skip_per_level[i] = product of the fan-outs below level i.
        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    // All callers share the single static stride table.
    thr_bar->skip_per_level = mac_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED