/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses.  See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        sprintf(scan, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    sprintf(scan, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        sprintf(scan, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        sprintf(scan, ",...");
        while (*scan != '\0') scan++;
    }
    sprintf(scan, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}


//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */

static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}

/** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
    /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
        etc.  We don't want to get specific with nomenclature */
    static const kmp_uint32 maxLevels=7;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf. It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_depth;
    kmp_uint32 base_num_threads;
    bool uninitialized;

    /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
        node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
    kmp_uint32 numPerLevel[maxLevels];
    kmp_uint32 skipPerLevel[maxLevels];

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : depth(1), uninitialized(true) {}
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        uninitialized = false;
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if (numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];

        base_depth = depth;
    }
};

static hierarchy_info machine_hierarchy;

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    if (machine_hierarchy.uninitialized)
        machine_hierarchy.init(NULL, nproc);

    if (nproc <= machine_hierarchy.base_num_threads)
        machine_hierarchy.depth = machine_hierarchy.base_depth;
    KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0);
    while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) {
        machine_hierarchy.depth++;
        machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2];
    }
    thr_bar->depth = machine_hierarchy.depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
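
// For illustration only (a hand trace, not part of the algorithm): on the
// fallback path in init() above (adr2os == NULL) with num_addrs == 16, the
// arrays end up as numPerLevel = {4, 4, 1, ...} and skipPerLevel = {1, 4, 16, ...},
// i.e. each leaf covers 1 thread, each level-1 node covers 4 threads, and the
// next level up covers all 16.  __kmp_get_hierarchy() then extends
// skipPerLevel by doubling the top entry whenever nproc outgrows the
// recorded hierarchy.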

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example:  suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling __kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
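
// For example (hypothetical numbers): with nPackages == 2, nCoresPerPkg == 4
// and __kmp_nThreadsPerCore == 2, the topology is uniform only if all
// 2 * 4 * 2 == 16 logical processors are available; if some cores are offline
// or excluded from the initial affinity mask, __kmp_avail_proc is smaller and
// the topology is treated as non-uniform.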


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}
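
// For example, __kmp_cpuid_mask_width(6) returns 3 (2^3 = 8 is the smallest
// power of two >= 6), and __kmp_cpuid_mask_width(2) returns 1.  The result is
// the number of low-order APIC ID bits reserved for the corresponding
// topology field.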


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
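    // Worked example (hypothetical values, for illustration of the
    // decomposition performed in the loop below): suppose a thread reports
    // apicId = 0x35 (binary 110101), maxThreadsPerPkg = 8 and
    // maxCoresPerPkg = 4.  Then widthCT = 3, widthC = 2 and widthT = 1, so
    //     pkgId    = 0x35 >> 3        = 6
    //     coreId   = (0x35 >> 1) & 3  = 2
    //     threadId = 0x35 & 1         = 1
    // matching the pkg# : core# : thread# layout described above.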
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;                         // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
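    // For illustration (hypothetical sorted table): with four entries whose
    // (pkgId, coreId, threadId) labels are (0,0,0), (0,0,1), (0,1,0), (1,0,0),
    // the pass above ends with nPackages = 2, nCoresPerPkg = 2,
    // __kmp_nThreadsPerCore = 2 and nCores = 3, i.e. the per-package maxima
    // of the core and thread counters, and one core counted per distinct
    // (pkgId, coreId) pair.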

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
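
    // For reference (a summary of the leaf 11 encoding as used by the loops
    // below): each sub-leaf of cpuid(11, level) reports the level type in
    // bits 8:15 of ecx (1 = SMT, 2 = core), a logical-processor count for
    // that level in ebx, the cumulative number of low x2APIC ID bits that
    // identify everything at or below the level in bits 0:4 of eax, and the
    // current thread's full x2APIC ID in edx.  A sub-leaf with ebx == 0
    // marks the end of the enumeration.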

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We
    // will try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing incorrect topology
                    // information in case the max value for some level
                    // (maxCt[level]) is encountered earlier than some smaller
                    // value while going through the array.  For example,
                    // suppose pkg0 has 4 cores and pkg1 has 2 cores.  Then
                    // maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }
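
    // For illustration (hypothetical machine): on a single-package system
    // with 4 cores and Hyper-Threading disabled, depth == 3 with
    // maxCt == {1, 4, 1} for {package, core, thread}.  The thread level has
    // radix 1 and is dropped, while the package level is kept even though its
    // radix is also 1, so new_depth == 2.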

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1) && (level != pkgLevel)) {
                if (level == threadLevel) {
                    threadLevel = -1;
                }
                else if ((threadLevel >= 0) && (level < threadLevel)) {
                    threadLevel--;
                }
                if (level == coreLevel) {
                    coreLevel = -1;
                }
                else if ((coreLevel >= 0) && (level < coreLevel)) {
                    coreLevel--;
                }
                if (level < pkgLevel) {
                    pkgLevel--;
                }
                continue;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;


static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
};


static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    unsigned i;
    const unsigned *aa = *((const unsigned **)a);
    const unsigned *bb = *((const unsigned **)b);
    for (i = maxIndex; ; i--) {
        if (aa[i] < bb[i]) return -1;
        if (aa[i] > bb[i]) return 1;
        if (i == osIdIndex) break;
    }
    return 0;
}
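
// With the indices above, each row of the table built below is laid out as
// { osId, threadId, coreId, pkgId, nodeId_0, ... }, and
// __kmp_affinity_cmp_ProcCpuInfo_phys_id() compares rows starting from the
// coarsest field (highest index, e.g. node then package) down to the finest,
// falling back to osId last.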
fgets(buf, sizeof(buf), f)) { 1834 // 1835 // Read errors presumably because of EOF 1836 // 1837 break; 1838 } 1839 1840 char s1[] = "processor"; 1841 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1842 num_records++; 1843 continue; 1844 } 1845 1846 // 1847 // FIXME - this will match "node_<n> <garbage>" 1848 // 1849 unsigned level; 1850 if (sscanf(buf, "node_%d id", &level) == 1) { 1851 if (nodeIdIndex + level >= maxIndex) { 1852 maxIndex = nodeIdIndex + level; 1853 } 1854 continue; 1855 } 1856 } 1857 1858 // 1859 // Check for empty file / no valid processor records, or too many. 1860 // The number of records can't exceed the number of valid bits in the 1861 // affinity mask. 1862 // 1863 if (num_records == 0) { 1864 *line = 0; 1865 *msg_id = kmp_i18n_str_NoProcRecords; 1866 return -1; 1867 } 1868 if (num_records > (unsigned)__kmp_xproc) { 1869 *line = 0; 1870 *msg_id = kmp_i18n_str_TooManyProcRecords; 1871 return -1; 1872 } 1873 1874 // 1875 // Set the file pointer back to the begginning, so that we can scan the 1876 // file again, this time performing a full parse of the data. 1877 // Allocate a vector of ProcCpuInfo object, where we will place the data. 1878 // Adding an extra element at the end allows us to remove a lot of extra 1879 // checks for termination conditions. 1880 // 1881 if (fseek(f, 0, SEEK_SET) != 0) { 1882 *line = 0; 1883 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1884 return -1; 1885 } 1886 1887 // 1888 // Allocate the array of records to store the proc info in. The dummy 1889 // element at the end makes the logic in filling them out easier to code. 1890 // 1891 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1892 * sizeof(unsigned *)); 1893 unsigned i; 1894 for (i = 0; i <= num_records; i++) { 1895 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1896 * sizeof(unsigned)); 1897 } 1898 1899 #define CLEANUP_THREAD_INFO \ 1900 for (i = 0; i <= num_records; i++) { \ 1901 __kmp_free(threadInfo[i]); \ 1902 } \ 1903 __kmp_free(threadInfo); 1904 1905 // 1906 // A value of UINT_MAX means that we didn't find the field 1907 // 1908 unsigned __index; 1909 1910 #define INIT_PROC_INFO(p) \ 1911 for (__index = 0; __index <= maxIndex; __index++) { \ 1912 (p)[__index] = UINT_MAX; \ 1913 } 1914 1915 for (i = 0; i <= num_records; i++) { 1916 INIT_PROC_INFO(threadInfo[i]); 1917 } 1918 1919 unsigned num_avail = 0; 1920 *line = 0; 1921 while (! feof(f)) { 1922 // 1923 // Create an inner scoping level, so that all the goto targets at the 1924 // end of the loop appear in an outer scoping level. This avoids 1925 // warnings about jumping past an initialization to a target in the 1926 // same block. 1927 // 1928 { 1929 buf[sizeof(buf) - 1] = 1; 1930 bool long_line = false; 1931 if (! fgets(buf, sizeof(buf), f)) { 1932 // 1933 // Read errors presumably because of EOF 1934 // 1935 // If there is valid data in threadInfo[num_avail], then fake 1936 // a blank line in ensure that the last address gets parsed. 1937 // 1938 bool valid = false; 1939 for (i = 0; i <= maxIndex; i++) { 1940 if (threadInfo[num_avail][i] != UINT_MAX) { 1941 valid = true; 1942 } 1943 } 1944 if (! valid) { 1945 break; 1946 } 1947 buf[0] = 0; 1948 } else if (!buf[sizeof(buf) - 1]) { 1949 // 1950 // The line is longer than the buffer. Set a flag and don't 1951 // emit an error if we were going to ignore the line, anyway. 
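// (Detection note: the sentinel byte written into buf[sizeof(buf) - 1]
// before the fgets() call above is non-zero; fgets() NUL-terminates
// whatever it reads, so if that byte is now zero the input line filled
// the entire buffer and was truncated.)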
1952 // 1953 long_line = true; 1954 1955 #define CHECK_LINE \ 1956 if (long_line) { \ 1957 CLEANUP_THREAD_INFO; \ 1958 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1959 return -1; \ 1960 } 1961 } 1962 (*line)++; 1963 1964 char s1[] = "processor"; 1965 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1966 CHECK_LINE; 1967 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1968 unsigned val; 1969 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1970 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1971 threadInfo[num_avail][osIdIndex] = val; 1972 #if KMP_OS_LINUX && USE_SYSFS_INFO 1973 char path[256]; 1974 snprintf(path, sizeof(path), 1975 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1976 threadInfo[num_avail][osIdIndex]); 1977 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1978 1979 snprintf(path, sizeof(path), 1980 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1981 threadInfo[num_avail][osIdIndex]); 1982 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1983 continue; 1984 #else 1985 } 1986 char s2[] = "physical id"; 1987 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1988 CHECK_LINE; 1989 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1990 unsigned val; 1991 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1992 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 1993 threadInfo[num_avail][pkgIdIndex] = val; 1994 continue; 1995 } 1996 char s3[] = "core id"; 1997 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1998 CHECK_LINE; 1999 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2000 unsigned val; 2001 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2002 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2003 threadInfo[num_avail][coreIdIndex] = val; 2004 continue; 2005 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2006 } 2007 char s4[] = "thread id"; 2008 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2009 CHECK_LINE; 2010 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2011 unsigned val; 2012 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2013 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2014 threadInfo[num_avail][threadIdIndex] = val; 2015 continue; 2016 } 2017 unsigned level; 2018 if (sscanf(buf, "node_%d id", &level) == 1) { 2019 CHECK_LINE; 2020 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2021 unsigned val; 2022 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2023 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2024 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2025 threadInfo[num_avail][nodeIdIndex + level] = val; 2026 continue; 2027 } 2028 2029 // 2030 // We didn't recognize the leading token on the line. 2031 // There are lots of leading tokens that we don't recognize - 2032 // if the line isn't empty, go on to the next line. 2033 // 2034 if ((*buf != 0) && (*buf != '\n')) { 2035 // 2036 // If the line is longer than the buffer, read characters 2037 // until we find a newline. 2038 // 2039 if (long_line) { 2040 int ch; 2041 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2042 } 2043 continue; 2044 } 2045 2046 // 2047 // A newline has signalled the end of the processor record. 2048 // Check that there aren't too many procs specified. 2049 // 2050 if ((int)num_avail == __kmp_xproc) { 2051 CLEANUP_THREAD_INFO; 2052 *msg_id = kmp_i18n_str_TooManyEntries; 2053 return -1; 2054 } 2055 2056 // 2057 // Check for missing fields. 
The osId field must be there, and we 2058 // currently require that the physical id field is specified, also. 2059 // 2060 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2061 CLEANUP_THREAD_INFO; 2062 *msg_id = kmp_i18n_str_MissingProcField; 2063 return -1; 2064 } 2065 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2066 CLEANUP_THREAD_INFO; 2067 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2068 return -1; 2069 } 2070 2071 // 2072 // Skip this proc if it is not included in the machine model. 2073 // 2074 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2075 INIT_PROC_INFO(threadInfo[num_avail]); 2076 continue; 2077 } 2078 2079 // 2080 // We have a successful parse of this proc's info. 2081 // Increment the counter, and prepare for the next proc. 2082 // 2083 num_avail++; 2084 KMP_ASSERT(num_avail <= num_records); 2085 INIT_PROC_INFO(threadInfo[num_avail]); 2086 } 2087 continue; 2088 2089 no_val: 2090 CLEANUP_THREAD_INFO; 2091 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2092 return -1; 2093 2094 dup_field: 2095 CLEANUP_THREAD_INFO; 2096 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2097 return -1; 2098 } 2099 *line = 0; 2100 2101 # if KMP_MIC && REDUCE_TEAM_SIZE 2102 unsigned teamSize = 0; 2103 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2104 2105 // check for num_records == __kmp_xproc ??? 2106 2107 // 2108 // If there's only one thread context to bind to, form an Address object 2109 // with depth 1 and return immediately (or, if affinity is off, set 2110 // address2os to NULL and return). 2111 // 2112 // If it is configured to omit the package level when there is only a 2113 // single package, the logic at the end of this routine won't work if 2114 // there is only a single thread - it would try to form an Address 2115 // object with depth 0. 2116 // 2117 KMP_ASSERT(num_avail > 0); 2118 KMP_ASSERT(num_avail <= num_records); 2119 if (num_avail == 1) { 2120 __kmp_ncores = 1; 2121 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2122 if (__kmp_affinity_verbose) { 2123 if (! 
KMP_AFFINITY_CAPABLE()) { 2124 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2125 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2126 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2127 } 2128 else { 2129 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2130 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2131 fullMask); 2132 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2133 if (__kmp_affinity_respect_mask) { 2134 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2135 } else { 2136 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2137 } 2138 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2139 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2140 } 2141 int index; 2142 kmp_str_buf_t buf; 2143 __kmp_str_buf_init(&buf); 2144 __kmp_str_buf_print(&buf, "1"); 2145 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2146 __kmp_str_buf_print(&buf, " x 1"); 2147 } 2148 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2149 __kmp_str_buf_free(&buf); 2150 } 2151 2152 if (__kmp_affinity_type == affinity_none) { 2153 CLEANUP_THREAD_INFO; 2154 return 0; 2155 } 2156 2157 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2158 Address addr(1); 2159 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2160 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2161 2162 if (__kmp_affinity_gran_levels < 0) { 2163 __kmp_affinity_gran_levels = 0; 2164 } 2165 2166 if (__kmp_affinity_verbose) { 2167 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2168 } 2169 2170 CLEANUP_THREAD_INFO; 2171 return 1; 2172 } 2173 2174 // 2175 // Sort the threadInfo table by physical Id. 2176 // 2177 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2178 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2179 2180 // 2181 // The table is now sorted by pkgId / coreId / threadId, but we really 2182 // don't know the radix of any of the fields. pkgId's may be sparsely 2183 // assigned among the chips on a system. Although coreId's are usually 2184 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2185 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2186 // 2187 // For that matter, we don't know what coresPerPkg and threadsPerCore 2188 // (or the total # packages) are at this point - we want to determine 2189 // that now. We only have an upper bound on the first two figures. 2190 // 2191 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2192 * sizeof(unsigned)); 2193 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2194 * sizeof(unsigned)); 2195 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2196 * sizeof(unsigned)); 2197 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2198 * sizeof(unsigned)); 2199 2200 bool assign_thread_ids = false; 2201 unsigned threadIdCt; 2202 unsigned index; 2203 2204 restart_radix_check: 2205 threadIdCt = 0; 2206 2207 // 2208 // Initialize the counter arrays with data from threadInfo[0]. 2209 // 2210 if (assign_thread_ids) { 2211 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2212 threadInfo[0][threadIdIndex] = threadIdCt++; 2213 } 2214 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2215 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2216 } 2217 } 2218 for (index = 0; index <= maxIndex; index++) { 2219 counts[index] = 1; 2220 maxCt[index] = 1; 2221 totals[index] = 1; 2222 lastId[index] = threadInfo[0][index];; 2223 } 2224 2225 // 2226 // Run through the rest of the OS procs. 
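// In effect: whenever the id at some level differs from lastId[], the
// per-parent counts at all less significant levels are folded into
// maxCt[] and reset to 1, and totals[] is bumped at the changed level and
// below. At the end maxCt[threadIdIndex] is the widest core,
// maxCt[coreIdIndex] the widest package, and totals[pkgIdIndex] the
// number of packages seen.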
2227 // 2228 for (i = 1; i < num_avail; i++) { 2229 // 2230 // Find the most significant index whose id differs 2231 // from the id for the previous OS proc. 2232 // 2233 for (index = maxIndex; index >= threadIdIndex; index--) { 2234 if (assign_thread_ids && (index == threadIdIndex)) { 2235 // 2236 // Auto-assign the thread id field if it wasn't specified. 2237 // 2238 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2239 threadInfo[i][threadIdIndex] = threadIdCt++; 2240 } 2241 2242 // 2243 // Aparrently the thread id field was specified for some 2244 // entries and not others. Start the thread id counter 2245 // off at the next higher thread id. 2246 // 2247 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2248 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2249 } 2250 } 2251 if (threadInfo[i][index] != lastId[index]) { 2252 // 2253 // Run through all indices which are less significant, 2254 // and reset the counts to 1. 2255 // 2256 // At all levels up to and including index, we need to 2257 // increment the totals and record the last id. 2258 // 2259 unsigned index2; 2260 for (index2 = threadIdIndex; index2 < index; index2++) { 2261 totals[index2]++; 2262 if (counts[index2] > maxCt[index2]) { 2263 maxCt[index2] = counts[index2]; 2264 } 2265 counts[index2] = 1; 2266 lastId[index2] = threadInfo[i][index2]; 2267 } 2268 counts[index]++; 2269 totals[index]++; 2270 lastId[index] = threadInfo[i][index]; 2271 2272 if (assign_thread_ids && (index > threadIdIndex)) { 2273 2274 # if KMP_MIC && REDUCE_TEAM_SIZE 2275 // 2276 // The default team size is the total #threads in the machine 2277 // minus 1 thread for every core that has 3 or more threads. 2278 // 2279 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2280 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2281 2282 // 2283 // Restart the thread counter, as we are on a new core. 2284 // 2285 threadIdCt = 0; 2286 2287 // 2288 // Auto-assign the thread id field if it wasn't specified. 2289 // 2290 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2291 threadInfo[i][threadIdIndex] = threadIdCt++; 2292 } 2293 2294 // 2295 // Aparrently the thread id field was specified for some 2296 // entries and not others. Start the thread id counter 2297 // off at the next higher thread id. 2298 // 2299 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2300 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2301 } 2302 } 2303 break; 2304 } 2305 } 2306 if (index < threadIdIndex) { 2307 // 2308 // If thread ids were specified, it is an error if they are not 2309 // unique. Also, check that we waven't already restarted the 2310 // loop (to be safe - shouldn't need to). 2311 // 2312 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2313 || assign_thread_ids) { 2314 __kmp_free(lastId); 2315 __kmp_free(totals); 2316 __kmp_free(maxCt); 2317 __kmp_free(counts); 2318 CLEANUP_THREAD_INFO; 2319 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2320 return -1; 2321 } 2322 2323 // 2324 // If the thread ids were not specified and we see entries 2325 // entries that are duplicates, start the loop over and 2326 // assign the thread ids manually. 2327 // 2328 assign_thread_ids = true; 2329 goto restart_radix_check; 2330 } 2331 } 2332 2333 # if KMP_MIC && REDUCE_TEAM_SIZE 2334 // 2335 // The default team size is the total #threads in the machine 2336 // minus 1 thread for every core that has 3 or more threads. 2337 // 2338 teamSize += ( threadIdCt <= 2 ) ? 
( threadIdCt ) : ( threadIdCt - 1 ); 2339 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2340 2341 for (index = threadIdIndex; index <= maxIndex; index++) { 2342 if (counts[index] > maxCt[index]) { 2343 maxCt[index] = counts[index]; 2344 } 2345 } 2346 2347 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2348 nCoresPerPkg = maxCt[coreIdIndex]; 2349 nPackages = totals[pkgIdIndex]; 2350 2351 // 2352 // Check to see if the machine topology is uniform 2353 // 2354 unsigned prod = totals[maxIndex]; 2355 for (index = threadIdIndex; index < maxIndex; index++) { 2356 prod *= maxCt[index]; 2357 } 2358 bool uniform = (prod == totals[threadIdIndex]); 2359 2360 // 2361 // When affinity is off, this routine will still be called to set 2362 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2363 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2364 // correctly, and return now if affinity is not enabled. 2365 // 2366 __kmp_ncores = totals[coreIdIndex]; 2367 2368 if (__kmp_affinity_verbose) { 2369 if (! KMP_AFFINITY_CAPABLE()) { 2370 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2371 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2372 if (uniform) { 2373 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2374 } else { 2375 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2376 } 2377 } 2378 else { 2379 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2380 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2381 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2382 if (__kmp_affinity_respect_mask) { 2383 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2384 } else { 2385 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2386 } 2387 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2388 if (uniform) { 2389 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2390 } else { 2391 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2392 } 2393 } 2394 kmp_str_buf_t buf; 2395 __kmp_str_buf_init(&buf); 2396 2397 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2398 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2399 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2400 } 2401 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2402 maxCt[threadIdIndex], __kmp_ncores); 2403 2404 __kmp_str_buf_free(&buf); 2405 } 2406 2407 # if KMP_MIC && REDUCE_TEAM_SIZE 2408 // 2409 // Set the default team size. 2410 // 2411 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2412 __kmp_dflt_team_nth = teamSize; 2413 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2414 __kmp_dflt_team_nth)); 2415 } 2416 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2417 2418 if (__kmp_affinity_type == affinity_none) { 2419 __kmp_free(lastId); 2420 __kmp_free(totals); 2421 __kmp_free(maxCt); 2422 __kmp_free(counts); 2423 CLEANUP_THREAD_INFO; 2424 return 0; 2425 } 2426 2427 // 2428 // Count the number of levels which have more nodes at that level than 2429 // at the parent's level (with there being an implicit root node of 2430 // the top level). This is equivalent to saying that there is at least 2431 // one node at this level which has a sibling. These levels are in the 2432 // map, and the package level is always in the map. 
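// For example (hypothetical data): one package with four cores and one
// thread per core gives totals[] of 4 threads, 4 cores, 1 package, so
// only the core level satisfies totals[index] > totals[index + 1]; the
// package level is then forced into the map anyway, and the resulting
// depth is 2.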
2433 // 2434 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2435 int level = 0; 2436 for (index = threadIdIndex; index < maxIndex; index++) { 2437 KMP_ASSERT(totals[index] >= totals[index + 1]); 2438 inMap[index] = (totals[index] > totals[index + 1]); 2439 } 2440 inMap[maxIndex] = (totals[maxIndex] > 1); 2441 inMap[pkgIdIndex] = true; 2442 2443 int depth = 0; 2444 for (index = threadIdIndex; index <= maxIndex; index++) { 2445 if (inMap[index]) { 2446 depth++; 2447 } 2448 } 2449 KMP_ASSERT(depth > 0); 2450 2451 // 2452 // Construct the data structure that is to be returned. 2453 // 2454 *address2os = (AddrUnsPair*) 2455 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2456 int pkgLevel = -1; 2457 int coreLevel = -1; 2458 int threadLevel = -1; 2459 2460 for (i = 0; i < num_avail; ++i) { 2461 Address addr(depth); 2462 unsigned os = threadInfo[i][osIdIndex]; 2463 int src_index; 2464 int dst_index = 0; 2465 2466 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2467 if (! inMap[src_index]) { 2468 continue; 2469 } 2470 addr.labels[dst_index] = threadInfo[i][src_index]; 2471 if (src_index == pkgIdIndex) { 2472 pkgLevel = dst_index; 2473 } 2474 else if (src_index == coreIdIndex) { 2475 coreLevel = dst_index; 2476 } 2477 else if (src_index == threadIdIndex) { 2478 threadLevel = dst_index; 2479 } 2480 dst_index++; 2481 } 2482 (*address2os)[i] = AddrUnsPair(addr, os); 2483 } 2484 2485 if (__kmp_affinity_gran_levels < 0) { 2486 // 2487 // Set the granularity level based on what levels are modeled 2488 // in the machine topology map. 2489 // 2490 unsigned src_index; 2491 __kmp_affinity_gran_levels = 0; 2492 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2493 if (! inMap[src_index]) { 2494 continue; 2495 } 2496 switch (src_index) { 2497 case threadIdIndex: 2498 if (__kmp_affinity_gran > affinity_gran_thread) { 2499 __kmp_affinity_gran_levels++; 2500 } 2501 2502 break; 2503 case coreIdIndex: 2504 if (__kmp_affinity_gran > affinity_gran_core) { 2505 __kmp_affinity_gran_levels++; 2506 } 2507 break; 2508 2509 case pkgIdIndex: 2510 if (__kmp_affinity_gran > affinity_gran_package) { 2511 __kmp_affinity_gran_levels++; 2512 } 2513 break; 2514 } 2515 } 2516 } 2517 2518 if (__kmp_affinity_verbose) { 2519 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2520 coreLevel, threadLevel); 2521 } 2522 2523 __kmp_free(inMap); 2524 __kmp_free(lastId); 2525 __kmp_free(totals); 2526 __kmp_free(maxCt); 2527 __kmp_free(counts); 2528 CLEANUP_THREAD_INFO; 2529 return depth; 2530 } 2531 2532 2533 // 2534 // Create and return a table of affinity masks, indexed by OS thread ID. 2535 // This routine handles OR'ing together all the affinity masks of threads 2536 // that are sufficiently close, if granularity > fine. 2537 // 2538 static kmp_affin_mask_t * 2539 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2540 AddrUnsPair *address2os, unsigned numAddrs) 2541 { 2542 // 2543 // First form a table of affinity masks in order of OS thread id. 2544 // 2545 unsigned depth; 2546 unsigned maxOsId; 2547 unsigned i; 2548 2549 KMP_ASSERT(numAddrs > 0); 2550 depth = address2os[0].first.depth; 2551 2552 maxOsId = 0; 2553 for (i = 0; i < numAddrs; i++) { 2554 unsigned osId = address2os[i].second; 2555 if (osId > maxOsId) { 2556 maxOsId = osId; 2557 } 2558 } 2559 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2560 (maxOsId + 1) * __kmp_affin_mask_size); 2561 2562 // 2563 // Sort the address2os table according to physical order. 
Doing so 2564 // will put all threads on the same core/package/node in consecutive 2565 // locations. 2566 // 2567 qsort(address2os, numAddrs, sizeof(*address2os), 2568 __kmp_affinity_cmp_Address_labels); 2569 2570 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2571 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2572 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2573 } 2574 if (__kmp_affinity_gran_levels >= (int)depth) { 2575 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2576 && (__kmp_affinity_type != affinity_none))) { 2577 KMP_WARNING(AffThreadsMayMigrate); 2578 } 2579 } 2580 2581 // 2582 // Run through the table, forming the masks for all threads on each 2583 // core. Threads on the same core will have identical "Address" 2584 // objects, not considering the last level, which must be the thread 2585 // id. All threads on a core will appear consecutively. 2586 // 2587 unsigned unique = 0; 2588 unsigned j = 0; // index of 1st thread on core 2589 unsigned leader = 0; 2590 Address *leaderAddr = &(address2os[0].first); 2591 kmp_affin_mask_t *sum 2592 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size); 2593 KMP_CPU_ZERO(sum); 2594 KMP_CPU_SET(address2os[0].second, sum); 2595 for (i = 1; i < numAddrs; i++) { 2596 // 2597 // If this thread is sufficiently close to the leader (within the 2598 // granularity setting), then set the bit for this os thread in the 2599 // affinity mask for this group, and go on to the next thread. 2600 // 2601 if (leaderAddr->isClose(address2os[i].first, 2602 __kmp_affinity_gran_levels)) { 2603 KMP_CPU_SET(address2os[i].second, sum); 2604 continue; 2605 } 2606 2607 // 2608 // For every thread in this group, copy the mask to the thread's 2609 // entry in the osId2Mask table. Mark the first address as a 2610 // leader. 2611 // 2612 for (; j < i; j++) { 2613 unsigned osId = address2os[j].second; 2614 KMP_DEBUG_ASSERT(osId <= maxOsId); 2615 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2616 KMP_CPU_COPY(mask, sum); 2617 address2os[j].first.leader = (j == leader); 2618 } 2619 unique++; 2620 2621 // 2622 // Start a new mask. 2623 // 2624 leader = i; 2625 leaderAddr = &(address2os[i].first); 2626 KMP_CPU_ZERO(sum); 2627 KMP_CPU_SET(address2os[i].second, sum); 2628 } 2629 2630 // 2631 // For every thread in last group, copy the mask to the thread's 2632 // entry in the osId2Mask table. 2633 // 2634 for (; j < i; j++) { 2635 unsigned osId = address2os[j].second; 2636 KMP_DEBUG_ASSERT(osId <= maxOsId); 2637 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2638 KMP_CPU_COPY(mask, sum); 2639 address2os[j].first.leader = (j == leader); 2640 } 2641 unique++; 2642 2643 *maxIndex = maxOsId; 2644 *numUnique = unique; 2645 return osId2Mask; 2646 } 2647 2648 2649 // 2650 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2651 // as file-static than to try and pass them through the calling sequence of 2652 // the recursive-descent OMP_PLACES parser. 2653 // 2654 static kmp_affin_mask_t *newMasks; 2655 static int numNewMasks; 2656 static int nextNewMask; 2657 2658 #define ADD_MASK(_mask) \ 2659 { \ 2660 if (nextNewMask >= numNewMasks) { \ 2661 numNewMasks *= 2; \ 2662 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2663 numNewMasks * __kmp_affin_mask_size); \ 2664 } \ 2665 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2666 nextNewMask++; \ 2667 } 2668 2669 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2670 { \ 2671 if (((_osId) > _maxOsId) || \ 2672 (! 
KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2673 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2674 && (__kmp_affinity_type != affinity_none))) { \ 2675 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2676 } \ 2677 } \ 2678 else { \ 2679 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2680 } \ 2681 } 2682 2683 2684 // 2685 // Re-parse the proclist (for the explicit affinity type), and form the list 2686 // of affinity newMasks indexed by gtid. 2687 // 2688 static void 2689 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2690 unsigned int *out_numMasks, const char *proclist, 2691 kmp_affin_mask_t *osId2Mask, int maxOsId) 2692 { 2693 const char *scan = proclist; 2694 const char *next = proclist; 2695 2696 // 2697 // We use malloc() for the temporary mask vector, 2698 // so that we can use realloc() to extend it. 2699 // 2700 numNewMasks = 2; 2701 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2702 * __kmp_affin_mask_size); 2703 nextNewMask = 0; 2704 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2705 __kmp_affin_mask_size); 2706 int setSize = 0; 2707 2708 for (;;) { 2709 int start, end, stride; 2710 2711 SKIP_WS(scan); 2712 next = scan; 2713 if (*next == '\0') { 2714 break; 2715 } 2716 2717 if (*next == '{') { 2718 int num; 2719 setSize = 0; 2720 next++; // skip '{' 2721 SKIP_WS(next); 2722 scan = next; 2723 2724 // 2725 // Read the first integer in the set. 2726 // 2727 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2728 "bad proclist"); 2729 SKIP_DIGITS(next); 2730 num = __kmp_str_to_int(scan, *next); 2731 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2732 2733 // 2734 // Copy the mask for that osId to the sum (union) mask. 2735 // 2736 if ((num > maxOsId) || 2737 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2738 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2739 && (__kmp_affinity_type != affinity_none))) { 2740 KMP_WARNING(AffIgnoreInvalidProcID, num); 2741 } 2742 KMP_CPU_ZERO(sumMask); 2743 } 2744 else { 2745 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2746 setSize = 1; 2747 } 2748 2749 for (;;) { 2750 // 2751 // Check for end of set. 2752 // 2753 SKIP_WS(next); 2754 if (*next == '}') { 2755 next++; // skip '}' 2756 break; 2757 } 2758 2759 // 2760 // Skip optional comma. 2761 // 2762 if (*next == ',') { 2763 next++; 2764 } 2765 SKIP_WS(next); 2766 2767 // 2768 // Read the next integer in the set. 2769 // 2770 scan = next; 2771 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2772 "bad explicit proc list"); 2773 2774 SKIP_DIGITS(next); 2775 num = __kmp_str_to_int(scan, *next); 2776 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2777 2778 // 2779 // Add the mask for that osId to the sum mask. 2780 // 2781 if ((num > maxOsId) || 2782 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2783 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2784 && (__kmp_affinity_type != affinity_none))) { 2785 KMP_WARNING(AffIgnoreInvalidProcID, num); 2786 } 2787 } 2788 else { 2789 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2790 setSize++; 2791 } 2792 } 2793 if (setSize > 0) { 2794 ADD_MASK(sumMask); 2795 } 2796 2797 SKIP_WS(next); 2798 if (*next == ',') { 2799 next++; 2800 } 2801 scan = next; 2802 continue; 2803 } 2804 2805 // 2806 // Read the first integer. 
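// (For example, a proclist such as "{0,1},4,7-9" reaches this point for
// the "4" and "7-9" items: the braced set was handled above and produced
// a single mask containing OS procs 0 and 1, whereas a plain number or a
// range adds one mask per OS proc - here {4}, {7}, {8} and {9}.)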
2807 // 2808 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2809 SKIP_DIGITS(next); 2810 start = __kmp_str_to_int(scan, *next); 2811 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2812 SKIP_WS(next); 2813 2814 // 2815 // If this isn't a range, then add a mask to the list and go on. 2816 // 2817 if (*next != '-') { 2818 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2819 2820 // 2821 // Skip optional comma. 2822 // 2823 if (*next == ',') { 2824 next++; 2825 } 2826 scan = next; 2827 continue; 2828 } 2829 2830 // 2831 // This is a range. Skip over the '-' and read in the 2nd int. 2832 // 2833 next++; // skip '-' 2834 SKIP_WS(next); 2835 scan = next; 2836 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2837 SKIP_DIGITS(next); 2838 end = __kmp_str_to_int(scan, *next); 2839 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2840 2841 // 2842 // Check for a stride parameter 2843 // 2844 stride = 1; 2845 SKIP_WS(next); 2846 if (*next == ':') { 2847 // 2848 // A stride is specified. Skip over the ':" and read the 3rd int. 2849 // 2850 int sign = +1; 2851 next++; // skip ':' 2852 SKIP_WS(next); 2853 scan = next; 2854 if (*next == '-') { 2855 sign = -1; 2856 next++; 2857 SKIP_WS(next); 2858 scan = next; 2859 } 2860 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2861 "bad explicit proc list"); 2862 SKIP_DIGITS(next); 2863 stride = __kmp_str_to_int(scan, *next); 2864 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2865 stride *= sign; 2866 } 2867 2868 // 2869 // Do some range checks. 2870 // 2871 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2872 if (stride > 0) { 2873 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2874 } 2875 else { 2876 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2877 } 2878 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2879 2880 // 2881 // Add the mask for each OS proc # to the list. 2882 // 2883 if (stride > 0) { 2884 do { 2885 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2886 start += stride; 2887 } while (start <= end); 2888 } 2889 else { 2890 do { 2891 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2892 start += stride; 2893 } while (start >= end); 2894 } 2895 2896 // 2897 // Skip optional comma. 2898 // 2899 SKIP_WS(next); 2900 if (*next == ',') { 2901 next++; 2902 } 2903 scan = next; 2904 } 2905 2906 *out_numMasks = nextNewMask; 2907 if (nextNewMask == 0) { 2908 *out_masks = NULL; 2909 KMP_INTERNAL_FREE(newMasks); 2910 return; 2911 } 2912 *out_masks 2913 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2914 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2915 __kmp_free(sumMask); 2916 KMP_INTERNAL_FREE(newMasks); 2917 } 2918 2919 2920 # if OMP_40_ENABLED 2921 2922 /*----------------------------------------------------------------------------- 2923 2924 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2925 places. Again, Here is the grammar: 2926 2927 place_list := place 2928 place_list := place , place_list 2929 place := num 2930 place := place : num 2931 place := place : num : signed 2932 place := { subplacelist } 2933 place := ! 
place // (lowest priority) 2934 subplace_list := subplace 2935 subplace_list := subplace , subplace_list 2936 subplace := num 2937 subplace := num : num 2938 subplace := num : num : signed 2939 signed := num 2940 signed := + signed 2941 signed := - signed 2942 2943 -----------------------------------------------------------------------------*/ 2944 2945 static void 2946 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2947 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2948 { 2949 const char *next; 2950 2951 for (;;) { 2952 int start, count, stride, i; 2953 2954 // 2955 // Read in the starting proc id 2956 // 2957 SKIP_WS(*scan); 2958 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2959 "bad explicit places list"); 2960 next = *scan; 2961 SKIP_DIGITS(next); 2962 start = __kmp_str_to_int(*scan, *next); 2963 KMP_ASSERT(start >= 0); 2964 *scan = next; 2965 2966 // 2967 // valid follow sets are ',' ':' and '}' 2968 // 2969 SKIP_WS(*scan); 2970 if (**scan == '}' || **scan == ',') { 2971 if ((start > maxOsId) || 2972 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2973 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2974 && (__kmp_affinity_type != affinity_none))) { 2975 KMP_WARNING(AffIgnoreInvalidProcID, start); 2976 } 2977 } 2978 else { 2979 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2980 (*setSize)++; 2981 } 2982 if (**scan == '}') { 2983 break; 2984 } 2985 (*scan)++; // skip ',' 2986 continue; 2987 } 2988 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2989 (*scan)++; // skip ':' 2990 2991 // 2992 // Read count parameter 2993 // 2994 SKIP_WS(*scan); 2995 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2996 "bad explicit places list"); 2997 next = *scan; 2998 SKIP_DIGITS(next); 2999 count = __kmp_str_to_int(*scan, *next); 3000 KMP_ASSERT(count >= 0); 3001 *scan = next; 3002 3003 // 3004 // valid follow sets are ',' ':' and '}' 3005 // 3006 SKIP_WS(*scan); 3007 if (**scan == '}' || **scan == ',') { 3008 for (i = 0; i < count; i++) { 3009 if ((start > maxOsId) || 3010 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3011 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3012 && (__kmp_affinity_type != affinity_none))) { 3013 KMP_WARNING(AffIgnoreInvalidProcID, start); 3014 } 3015 break; // don't proliferate warnings for large count 3016 } 3017 else { 3018 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3019 start++; 3020 (*setSize)++; 3021 } 3022 } 3023 if (**scan == '}') { 3024 break; 3025 } 3026 (*scan)++; // skip ',' 3027 continue; 3028 } 3029 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3030 (*scan)++; // skip ':' 3031 3032 // 3033 // Read stride parameter 3034 // 3035 int sign = +1; 3036 for (;;) { 3037 SKIP_WS(*scan); 3038 if (**scan == '+') { 3039 (*scan)++; // skip '+' 3040 continue; 3041 } 3042 if (**scan == '-') { 3043 sign *= -1; 3044 (*scan)++; // skip '-' 3045 continue; 3046 } 3047 break; 3048 } 3049 SKIP_WS(*scan); 3050 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3051 "bad explicit places list"); 3052 next = *scan; 3053 SKIP_DIGITS(next); 3054 stride = __kmp_str_to_int(*scan, *next); 3055 KMP_ASSERT(stride >= 0); 3056 *scan = next; 3057 stride *= sign; 3058 3059 // 3060 // valid follow sets are ',' and '}' 3061 // 3062 SKIP_WS(*scan); 3063 if (**scan == '}' || **scan == ',') { 3064 for (i = 0; i < count; i++) { 3065 if ((start > maxOsId) || 3066 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3067 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3068 && (__kmp_affinity_type != affinity_none))) { 3069 KMP_WARNING(AffIgnoreInvalidProcID, start); 3070 } 3071 break; // don't proliferate warnings for large count 3072 } 3073 else { 3074 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3075 start += stride; 3076 (*setSize)++; 3077 } 3078 } 3079 if (**scan == '}') { 3080 break; 3081 } 3082 (*scan)++; // skip ',' 3083 continue; 3084 } 3085 3086 KMP_ASSERT2(0, "bad explicit places list"); 3087 } 3088 } 3089 3090 3091 static void 3092 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3093 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3094 { 3095 const char *next; 3096 3097 // 3098 // valid follow sets are '{' '!' and num 3099 // 3100 SKIP_WS(*scan); 3101 if (**scan == '{') { 3102 (*scan)++; // skip '{' 3103 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3104 setSize); 3105 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3106 (*scan)++; // skip '}' 3107 } 3108 else if (**scan == '!') { 3109 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3110 KMP_CPU_COMPLEMENT(tempMask); 3111 (*scan)++; // skip '!' 3112 } 3113 else if ((**scan >= '0') && (**scan <= '9')) { 3114 next = *scan; 3115 SKIP_DIGITS(next); 3116 int num = __kmp_str_to_int(*scan, *next); 3117 KMP_ASSERT(num >= 0); 3118 if ((num > maxOsId) || 3119 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3120 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3121 && (__kmp_affinity_type != affinity_none))) { 3122 KMP_WARNING(AffIgnoreInvalidProcID, num); 3123 } 3124 } 3125 else { 3126 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3127 (*setSize)++; 3128 } 3129 *scan = next; // skip num 3130 } 3131 else { 3132 KMP_ASSERT2(0, "bad explicit places list"); 3133 } 3134 } 3135 3136 3137 //static void 3138 void 3139 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3140 unsigned int *out_numMasks, const char *placelist, 3141 kmp_affin_mask_t *osId2Mask, int maxOsId) 3142 { 3143 const char *scan = placelist; 3144 const char *next = placelist; 3145 3146 numNewMasks = 2; 3147 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3148 * __kmp_affin_mask_size); 3149 nextNewMask = 0; 3150 3151 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3152 __kmp_affin_mask_size); 3153 KMP_CPU_ZERO(tempMask); 3154 int setSize = 0; 3155 3156 for (;;) { 3157 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3158 3159 // 3160 // valid follow sets are ',' ':' and EOL 3161 // 3162 SKIP_WS(scan); 3163 if (*scan == '\0' || *scan == ',') { 3164 if (setSize > 0) { 3165 ADD_MASK(tempMask); 3166 } 3167 KMP_CPU_ZERO(tempMask); 3168 setSize = 0; 3169 if (*scan == '\0') { 3170 break; 3171 } 3172 scan++; // skip ',' 3173 continue; 3174 } 3175 3176 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3177 scan++; // skip ':' 3178 3179 // 3180 // Read count parameter 3181 // 3182 SKIP_WS(scan); 3183 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3184 "bad explicit places list"); 3185 next = scan; 3186 SKIP_DIGITS(next); 3187 int count = __kmp_str_to_int(scan, *next); 3188 KMP_ASSERT(count >= 0); 3189 scan = next; 3190 3191 // 3192 // valid follow sets are ',' ':' and EOL 3193 // 3194 SKIP_WS(scan); 3195 int stride; 3196 if (*scan == '\0' || *scan == ',') { 3197 stride = +1; 3198 } 3199 else { 3200 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3201 
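// (Illustrative example: with OMP_PLACES="{0,1}:4:2", the place {0,1}
// has already been formed in tempMask, count is 4, and the stride read
// below is 2, so the shift loop further down emits the places {0,1},
// {2,3}, {4,5} and {6,7}, assuming those OS procs are all valid.)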
scan++; // skip ':' 3202 3203 // 3204 // Read stride parameter 3205 // 3206 int sign = +1; 3207 for (;;) { 3208 SKIP_WS(scan); 3209 if (*scan == '+') { 3210 scan++; // skip '+' 3211 continue; 3212 } 3213 if (*scan == '-') { 3214 sign *= -1; 3215 scan++; // skip '-' 3216 continue; 3217 } 3218 break; 3219 } 3220 SKIP_WS(scan); 3221 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3222 "bad explicit places list"); 3223 next = scan; 3224 SKIP_DIGITS(next); 3225 stride = __kmp_str_to_int(scan, *next); 3226 KMP_DEBUG_ASSERT(stride >= 0); 3227 scan = next; 3228 stride *= sign; 3229 } 3230 3231 if (stride > 0) { 3232 int i; 3233 for (i = 0; i < count; i++) { 3234 int j; 3235 if (setSize == 0) { 3236 break; 3237 } 3238 ADD_MASK(tempMask); 3239 setSize = 0; 3240 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3241 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3242 KMP_CPU_CLR(j, tempMask); 3243 } 3244 else if ((j > maxOsId) || 3245 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3246 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3247 && (__kmp_affinity_type != affinity_none))) { 3248 KMP_WARNING(AffIgnoreInvalidProcID, j); 3249 } 3250 KMP_CPU_CLR(j, tempMask); 3251 } 3252 else { 3253 KMP_CPU_SET(j, tempMask); 3254 setSize++; 3255 } 3256 } 3257 for (; j >= 0; j--) { 3258 KMP_CPU_CLR(j, tempMask); 3259 } 3260 } 3261 } 3262 else { 3263 int i; 3264 for (i = 0; i < count; i++) { 3265 int j; 3266 if (setSize == 0) { 3267 break; 3268 } 3269 ADD_MASK(tempMask); 3270 setSize = 0; 3271 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 3272 j++) { 3273 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3274 KMP_CPU_CLR(j, tempMask); 3275 } 3276 else if ((j > maxOsId) || 3277 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3278 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3279 && (__kmp_affinity_type != affinity_none))) { 3280 KMP_WARNING(AffIgnoreInvalidProcID, j); 3281 } 3282 KMP_CPU_CLR(j, tempMask); 3283 } 3284 else { 3285 KMP_CPU_SET(j, tempMask); 3286 setSize++; 3287 } 3288 } 3289 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3290 KMP_CPU_CLR(j, tempMask); 3291 } 3292 } 3293 } 3294 KMP_CPU_ZERO(tempMask); 3295 setSize = 0; 3296 3297 // 3298 // valid follow sets are ',' and EOL 3299 // 3300 SKIP_WS(scan); 3301 if (*scan == '\0') { 3302 break; 3303 } 3304 if (*scan == ',') { 3305 scan++; // skip ',' 3306 continue; 3307 } 3308 3309 KMP_ASSERT2(0, "bad explicit places list"); 3310 } 3311 3312 *out_numMasks = nextNewMask; 3313 if (nextNewMask == 0) { 3314 *out_masks = NULL; 3315 KMP_INTERNAL_FREE(newMasks); 3316 return; 3317 } 3318 *out_masks 3319 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3320 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3321 __kmp_free(tempMask); 3322 KMP_INTERNAL_FREE(newMasks); 3323 } 3324 3325 # endif /* OMP_40_ENABLED */ 3326 3327 #undef ADD_MASK 3328 #undef ADD_MASK_OSID 3329 3330 static void 3331 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3332 { 3333 if ( __kmp_place_num_cores == 0 ) { 3334 if ( __kmp_place_num_threads_per_core == 0 ) { 3335 return; // no cores limiting actions requested, exit 3336 } 3337 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3338 } 3339 if ( !__kmp_affinity_uniform_topology() ) { 3340 KMP_WARNING( AffThrPlaceNonUniform ); 3341 return; // don't support non-uniform topology 3342 } 3343 if ( depth != 3 ) { 3344 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3345 return; // don't support not-3-level topology 3346 } 3347 
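// Illustrative example (hypothetical machine): on a uniform topology of
// 2 packages x 8 cores x 2 HW threads, requesting 4 cores at core offset
// 2 with 1 thread per core keeps cores 2-5 of each package, so the copy
// loop below leaves nCoresPerPkg = 4, __kmp_nThreadsPerCore = 1,
// __kmp_ncores = 8 and __kmp_avail_proc = 8.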
if ( __kmp_place_num_threads_per_core == 0 ) { 3348 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3349 } 3350 if ( __kmp_place_core_offset + __kmp_place_num_cores > (unsigned int)nCoresPerPkg ) { 3351 KMP_WARNING( AffThrPlaceManyCores ); 3352 return; 3353 } 3354 3355 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3356 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3357 int i, j, k, n_old = 0, n_new = 0; 3358 for ( i = 0; i < nPackages; ++i ) { 3359 for ( j = 0; j < nCoresPerPkg; ++j ) { 3360 if ( (unsigned int)j < __kmp_place_core_offset || (unsigned int)j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3361 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3362 } else { 3363 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3364 if ( (unsigned int)k < __kmp_place_num_threads_per_core ) { 3365 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location 3366 n_new++; 3367 } 3368 n_old++; 3369 } 3370 } 3371 } 3372 } 3373 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3374 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3375 __kmp_avail_proc = n_new; // correct avail_proc 3376 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3377 3378 __kmp_free( *pAddr ); 3379 *pAddr = newAddr; // replace old topology with new one 3380 } 3381 3382 3383 static AddrUnsPair *address2os = NULL; 3384 static int * procarr = NULL; 3385 static int __kmp_aff_depth = 0; 3386 3387 static void 3388 __kmp_aux_affinity_initialize(void) 3389 { 3390 if (__kmp_affinity_masks != NULL) { 3391 KMP_ASSERT(fullMask != NULL); 3392 return; 3393 } 3394 3395 // 3396 // Create the "full" mask - this defines all of the processors that we 3397 // consider to be in the machine model. If respect is set, then it is 3398 // the initialization thread's affinity mask. Otherwise, it is all 3399 // processors that we know about on the machine. 3400 // 3401 if (fullMask == NULL) { 3402 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3403 } 3404 if (KMP_AFFINITY_CAPABLE()) { 3405 if (__kmp_affinity_respect_mask) { 3406 __kmp_get_system_affinity(fullMask, TRUE); 3407 3408 // 3409 // Count the number of available processors. 3410 // 3411 unsigned i; 3412 __kmp_avail_proc = 0; 3413 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3414 if (! KMP_CPU_ISSET(i, fullMask)) { 3415 continue; 3416 } 3417 __kmp_avail_proc++; 3418 } 3419 if (__kmp_avail_proc > __kmp_xproc) { 3420 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3421 && (__kmp_affinity_type != affinity_none))) { 3422 KMP_WARNING(ErrorInitializeAffinity); 3423 } 3424 __kmp_affinity_type = affinity_none; 3425 __kmp_affin_mask_size = 0; 3426 return; 3427 } 3428 } 3429 else { 3430 __kmp_affinity_entire_machine_mask(fullMask); 3431 __kmp_avail_proc = __kmp_xproc; 3432 } 3433 } 3434 3435 int depth = -1; 3436 kmp_i18n_id_t msg_id = kmp_i18n_null; 3437 3438 // 3439 // For backward compatibility, setting KMP_CPUINFO_FILE => 3440 // KMP_TOPOLOGY_METHOD=cpuinfo 3441 // 3442 if ((__kmp_cpuinfo_file != NULL) && 3443 (__kmp_affinity_top_method == affinity_top_method_all)) { 3444 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3445 } 3446 3447 if (__kmp_affinity_top_method == affinity_top_method_all) { 3448 // 3449 // In the default code path, errors are not fatal - we just try using 3450 // another method. 
We only emit a warning message if affinity is on, 3451 // or the verbose flag is set, an the nowarnings flag was not set. 3452 // 3453 const char *file_name = NULL; 3454 int line = 0; 3455 3456 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3457 3458 if (__kmp_affinity_verbose) { 3459 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3460 } 3461 3462 file_name = NULL; 3463 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3464 if (depth == 0) { 3465 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3466 KMP_ASSERT(address2os == NULL); 3467 return; 3468 } 3469 3470 if (depth < 0) { 3471 if (__kmp_affinity_verbose) { 3472 if (msg_id != kmp_i18n_null) { 3473 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3474 KMP_I18N_STR(DecodingLegacyAPIC)); 3475 } 3476 else { 3477 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3478 } 3479 } 3480 3481 file_name = NULL; 3482 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3483 if (depth == 0) { 3484 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3485 KMP_ASSERT(address2os == NULL); 3486 return; 3487 } 3488 } 3489 3490 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3491 3492 # if KMP_OS_LINUX 3493 3494 if (depth < 0) { 3495 if (__kmp_affinity_verbose) { 3496 if (msg_id != kmp_i18n_null) { 3497 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3498 } 3499 else { 3500 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3501 } 3502 } 3503 3504 FILE *f = fopen("/proc/cpuinfo", "r"); 3505 if (f == NULL) { 3506 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3507 } 3508 else { 3509 file_name = "/proc/cpuinfo"; 3510 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3511 fclose(f); 3512 if (depth == 0) { 3513 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3514 KMP_ASSERT(address2os == NULL); 3515 return; 3516 } 3517 } 3518 } 3519 3520 # endif /* KMP_OS_LINUX */ 3521 3522 # if KMP_GROUP_AFFINITY 3523 3524 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3525 if (__kmp_affinity_verbose) { 3526 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3527 } 3528 3529 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3530 KMP_ASSERT(depth != 0); 3531 } 3532 3533 # endif /* KMP_GROUP_AFFINITY */ 3534 3535 if (depth < 0) { 3536 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3537 if (file_name == NULL) { 3538 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3539 } 3540 else if (line == 0) { 3541 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3542 } 3543 else { 3544 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3545 } 3546 } 3547 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3548 3549 file_name = ""; 3550 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3551 if (depth == 0) { 3552 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3553 KMP_ASSERT(address2os == NULL); 3554 return; 3555 } 3556 KMP_ASSERT(depth > 0); 3557 KMP_ASSERT(address2os != NULL); 3558 } 3559 } 3560 3561 // 3562 // If the user has specified that a paricular topology discovery method 3563 // is to be used, then we abort if that method fails. The exception is 3564 // group affinity, which might have been implicitly set. 
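// For example, KMP_TOPOLOGY_METHOD=cpuinfo (or setting KMP_CPUINFO_FILE)
// forces the /proc/cpuinfo parser below, and a parse failure is then
// reported via KMP_FATAL instead of falling back to the flat OS-proc map
// the way the default path above does.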
3565 // 3566 3567 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3568 3569 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3570 if (__kmp_affinity_verbose) { 3571 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3572 KMP_I18N_STR(Decodingx2APIC)); 3573 } 3574 3575 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3576 if (depth == 0) { 3577 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3578 KMP_ASSERT(address2os == NULL); 3579 return; 3580 } 3581 if (depth < 0) { 3582 KMP_ASSERT(msg_id != kmp_i18n_null); 3583 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3584 } 3585 } 3586 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3587 if (__kmp_affinity_verbose) { 3588 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3589 KMP_I18N_STR(DecodingLegacyAPIC)); 3590 } 3591 3592 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3593 if (depth == 0) { 3594 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3595 KMP_ASSERT(address2os == NULL); 3596 return; 3597 } 3598 if (depth < 0) { 3599 KMP_ASSERT(msg_id != kmp_i18n_null); 3600 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3601 } 3602 } 3603 3604 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3605 3606 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3607 const char *filename; 3608 if (__kmp_cpuinfo_file != NULL) { 3609 filename = __kmp_cpuinfo_file; 3610 } 3611 else { 3612 filename = "/proc/cpuinfo"; 3613 } 3614 3615 if (__kmp_affinity_verbose) { 3616 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3617 } 3618 3619 FILE *f = fopen(filename, "r"); 3620 if (f == NULL) { 3621 int code = errno; 3622 if (__kmp_cpuinfo_file != NULL) { 3623 __kmp_msg( 3624 kmp_ms_fatal, 3625 KMP_MSG(CantOpenFileForReading, filename), 3626 KMP_ERR(code), 3627 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3628 __kmp_msg_null 3629 ); 3630 } 3631 else { 3632 __kmp_msg( 3633 kmp_ms_fatal, 3634 KMP_MSG(CantOpenFileForReading, filename), 3635 KMP_ERR(code), 3636 __kmp_msg_null 3637 ); 3638 } 3639 } 3640 int line = 0; 3641 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3642 fclose(f); 3643 if (depth < 0) { 3644 KMP_ASSERT(msg_id != kmp_i18n_null); 3645 if (line > 0) { 3646 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3647 } 3648 else { 3649 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3650 } 3651 } 3652 if (__kmp_affinity_type == affinity_none) { 3653 KMP_ASSERT(depth == 0); 3654 KMP_ASSERT(address2os == NULL); 3655 return; 3656 } 3657 } 3658 3659 # if KMP_GROUP_AFFINITY 3660 3661 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3662 if (__kmp_affinity_verbose) { 3663 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3664 } 3665 3666 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3667 KMP_ASSERT(depth != 0); 3668 if (depth < 0) { 3669 KMP_ASSERT(msg_id != kmp_i18n_null); 3670 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3671 } 3672 } 3673 3674 # endif /* KMP_GROUP_AFFINITY */ 3675 3676 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3677 if (__kmp_affinity_verbose) { 3678 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3679 } 3680 3681 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3682 if (depth == 0) { 3683 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3684 KMP_ASSERT(address2os == NULL); 3685 return; 3686 } 3687 // should not fail 3688 KMP_ASSERT(depth > 0); 3689 KMP_ASSERT(address2os != NULL); 3690 } 3691 3692 if (address2os == NULL) { 3693 if 
(KMP_AFFINITY_CAPABLE() 3694 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3695 && (__kmp_affinity_type != affinity_none)))) { 3696 KMP_WARNING(ErrorInitializeAffinity); 3697 } 3698 __kmp_affinity_type = affinity_none; 3699 __kmp_affin_mask_size = 0; 3700 return; 3701 } 3702 3703 __kmp_apply_thread_places(&address2os, depth); 3704 3705 // 3706 // Create the table of masks, indexed by thread Id. 3707 // 3708 unsigned maxIndex; 3709 unsigned numUnique; 3710 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3711 address2os, __kmp_avail_proc); 3712 if (__kmp_affinity_gran_levels == 0) { 3713 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3714 } 3715 3716 // 3717 // Set the childNums vector in all Address objects. This must be done 3718 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3719 // which takes into account the setting of __kmp_affinity_compact. 3720 // 3721 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3722 3723 switch (__kmp_affinity_type) { 3724 3725 case affinity_explicit: 3726 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3727 # if OMP_40_ENABLED 3728 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3729 # endif 3730 { 3731 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3732 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3733 maxIndex); 3734 } 3735 # if OMP_40_ENABLED 3736 else { 3737 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3738 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3739 maxIndex); 3740 } 3741 # endif 3742 if (__kmp_affinity_num_masks == 0) { 3743 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3744 && (__kmp_affinity_type != affinity_none))) { 3745 KMP_WARNING(AffNoValidProcID); 3746 } 3747 __kmp_affinity_type = affinity_none; 3748 return; 3749 } 3750 break; 3751 3752 // 3753 // The other affinity types rely on sorting the Addresses according 3754 // to some permutation of the machine topology tree. Set 3755 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3756 // then jump to a common code fragment to do the sort and create 3757 // the array of affinity masks. 
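// For instance, on a depth-3 (package / core / thread) map,
// KMP_AFFINITY=compact keeps __kmp_affinity_compact at 0, so the sort
// below preserves topology order and consecutive thread ids land on the
// same core, while KMP_AFFINITY=scatter turns 0 into depth - 1 - 0 = 2,
// reversing the significance of the levels so that consecutive thread
// ids are spread across packages first.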
3758 // 3759 3760 case affinity_logical: 3761 __kmp_affinity_compact = 0; 3762 if (__kmp_affinity_offset) { 3763 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3764 % __kmp_avail_proc; 3765 } 3766 goto sortAddresses; 3767 3768 case affinity_physical: 3769 if (__kmp_nThreadsPerCore > 1) { 3770 __kmp_affinity_compact = 1; 3771 if (__kmp_affinity_compact >= depth) { 3772 __kmp_affinity_compact = 0; 3773 } 3774 } else { 3775 __kmp_affinity_compact = 0; 3776 } 3777 if (__kmp_affinity_offset) { 3778 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3779 % __kmp_avail_proc; 3780 } 3781 goto sortAddresses; 3782 3783 case affinity_scatter: 3784 if (__kmp_affinity_compact >= depth) { 3785 __kmp_affinity_compact = 0; 3786 } 3787 else { 3788 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3789 } 3790 goto sortAddresses; 3791 3792 case affinity_compact: 3793 if (__kmp_affinity_compact >= depth) { 3794 __kmp_affinity_compact = depth - 1; 3795 } 3796 goto sortAddresses; 3797 3798 case affinity_balanced: 3799 // Balanced works only for the case of a single package 3800 if( nPackages > 1 ) { 3801 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3802 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3803 } 3804 __kmp_affinity_type = affinity_none; 3805 return; 3806 } else if( __kmp_affinity_uniform_topology() ) { 3807 break; 3808 } else { // Non-uniform topology 3809 3810 // Save the depth for further usage 3811 __kmp_aff_depth = depth; 3812 3813 // Number of hyper threads per core in HT machine 3814 int nth_per_core = __kmp_nThreadsPerCore; 3815 3816 int core_level; 3817 if( nth_per_core > 1 ) { 3818 core_level = depth - 2; 3819 } else { 3820 core_level = depth - 1; 3821 } 3822 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3823 int nproc = nth_per_core * ncores; 3824 3825 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3826 for( int i = 0; i < nproc; i++ ) { 3827 procarr[ i ] = -1; 3828 } 3829 3830 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3831 int proc = address2os[ i ].second; 3832 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3833 // If there is only one thread per core then depth == 2: level 0 - package, 3834 // level 1 - core. 3835 int level = depth - 1; 3836 3837 // __kmp_nth_per_core == 1 3838 int thread = 0; 3839 int core = address2os[ i ].first.labels[ level ]; 3840 // If the thread level exists, that is we have more than one thread context per core 3841 if( nth_per_core > 1 ) { 3842 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3843 core = address2os[ i ].first.labels[ level - 1 ]; 3844 } 3845 procarr[ core * nth_per_core + thread ] = proc; 3846 } 3847 3848 break; 3849 } 3850 3851 sortAddresses: 3852 // 3853 // Allocate the gtid->affinity mask table. 
3854 // 3855 if (__kmp_affinity_dups) { 3856 __kmp_affinity_num_masks = __kmp_avail_proc; 3857 } 3858 else { 3859 __kmp_affinity_num_masks = numUnique; 3860 } 3861 3862 # if OMP_40_ENABLED 3863 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3864 && ( __kmp_affinity_num_places > 0 ) 3865 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3866 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3867 } 3868 # endif 3869 3870 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3871 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3872 3873 // 3874 // Sort the address2os table according to the current setting of 3875 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3876 // 3877 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3878 __kmp_affinity_cmp_Address_child_num); 3879 { 3880 int i; 3881 unsigned j; 3882 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3883 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3884 continue; 3885 } 3886 unsigned osId = address2os[i].second; 3887 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3888 kmp_affin_mask_t *dest 3889 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3890 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3891 KMP_CPU_COPY(dest, src); 3892 if (++j >= __kmp_affinity_num_masks) { 3893 break; 3894 } 3895 } 3896 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3897 } 3898 break; 3899 3900 default: 3901 KMP_ASSERT2(0, "Unexpected affinity setting"); 3902 } 3903 3904 __kmp_free(osId2Mask); 3905 machine_hierarchy.init(address2os, __kmp_avail_proc); 3906 } 3907 3908 3909 void 3910 __kmp_affinity_initialize(void) 3911 { 3912 // 3913 // Much of the code above was written assumming that if a machine was not 3914 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3915 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3916 // 3917 // There are too many checks for __kmp_affinity_type == affinity_none 3918 // in this code. Instead of trying to change them all, check if 3919 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3920 // affinity_none, call the real initialization routine, then restore 3921 // __kmp_affinity_type to affinity_disabled. 3922 // 3923 int disabled = (__kmp_affinity_type == affinity_disabled); 3924 if (! KMP_AFFINITY_CAPABLE()) { 3925 KMP_ASSERT(disabled); 3926 } 3927 if (disabled) { 3928 __kmp_affinity_type = affinity_none; 3929 } 3930 __kmp_aux_affinity_initialize(); 3931 if (disabled) { 3932 __kmp_affinity_type = affinity_disabled; 3933 } 3934 } 3935 3936 3937 void 3938 __kmp_affinity_uninitialize(void) 3939 { 3940 if (__kmp_affinity_masks != NULL) { 3941 __kmp_free(__kmp_affinity_masks); 3942 __kmp_affinity_masks = NULL; 3943 } 3944 if (fullMask != NULL) { 3945 KMP_CPU_FREE(fullMask); 3946 fullMask = NULL; 3947 } 3948 __kmp_affinity_num_masks = 0; 3949 # if OMP_40_ENABLED 3950 __kmp_affinity_num_places = 0; 3951 # endif 3952 if (__kmp_affinity_proclist != NULL) { 3953 __kmp_free(__kmp_affinity_proclist); 3954 __kmp_affinity_proclist = NULL; 3955 } 3956 if( address2os != NULL ) { 3957 __kmp_free( address2os ); 3958 address2os = NULL; 3959 } 3960 if( procarr != NULL ) { 3961 __kmp_free( procarr ); 3962 procarr = NULL; 3963 } 3964 } 3965 3966 3967 void 3968 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3969 { 3970 if (! 
void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
          || (__kmp_affinity_type == affinity_balanced)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED
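
//
// Note (illustrative): a thread's place partition may wrap around the end of
// the place list, in which case th_first_place > th_last_place; e.g. with 8
// places (assumed value) the partition {6,7,0,1} is stored as first_place = 6,
// last_place = 1.  The range checks below handle both orderings.
//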
void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
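    // Note: resetting proc_bind for the current task appears to be what keeps
    // the 4.0 place mechanism (__kmp_affinity_set_place) from later overriding
    // the mask installed above; this is a reading of the surrounding code
    // rather than a documented guarantee.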
4209 // 4210 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4211 # endif 4212 4213 return retval; 4214 } 4215 4216 4217 int 4218 __kmp_aux_get_affinity(void **mask) 4219 { 4220 int gtid; 4221 int retval; 4222 kmp_info_t *th; 4223 4224 if (! KMP_AFFINITY_CAPABLE()) { 4225 return -1; 4226 } 4227 4228 gtid = __kmp_entry_gtid(); 4229 th = __kmp_threads[gtid]; 4230 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4231 4232 KA_TRACE(1000, ;{ 4233 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4234 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4235 th->th.th_affin_mask); 4236 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf); 4237 }); 4238 4239 if (__kmp_env_consistency_check) { 4240 if ((mask == NULL) || (*mask == NULL)) { 4241 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4242 } 4243 } 4244 4245 # if !KMP_OS_WINDOWS 4246 4247 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4248 KA_TRACE(1000, ;{ 4249 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4250 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4251 (kmp_affin_mask_t *)(*mask)); 4252 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf); 4253 }); 4254 return retval; 4255 4256 # else 4257 4258 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4259 return 0; 4260 4261 # endif /* KMP_OS_WINDOWS */ 4262 4263 } 4264 4265 int 4266 __kmp_aux_set_affinity_mask_proc(int proc, void **mask) 4267 { 4268 int retval; 4269 4270 if (! KMP_AFFINITY_CAPABLE()) { 4271 return -1; 4272 } 4273 4274 KA_TRACE(1000, ;{ 4275 int gtid = __kmp_entry_gtid(); 4276 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4277 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4278 (kmp_affin_mask_t *)(*mask)); 4279 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4280 proc, gtid, buf); 4281 }); 4282 4283 if (__kmp_env_consistency_check) { 4284 if ((mask == NULL) || (*mask == NULL)) { 4285 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4286 } 4287 } 4288 4289 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4290 return -1; 4291 } 4292 if (! KMP_CPU_ISSET(proc, fullMask)) { 4293 return -2; 4294 } 4295 4296 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4297 return 0; 4298 } 4299 4300 4301 int 4302 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4303 { 4304 int retval; 4305 4306 if (! KMP_AFFINITY_CAPABLE()) { 4307 return -1; 4308 } 4309 4310 KA_TRACE(1000, ;{ 4311 int gtid = __kmp_entry_gtid(); 4312 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4313 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4314 (kmp_affin_mask_t *)(*mask)); 4315 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4316 proc, gtid, buf); 4317 }); 4318 4319 if (__kmp_env_consistency_check) { 4320 if ((mask == NULL) || (*mask == NULL)) { 4321 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4322 } 4323 } 4324 4325 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4326 return -1; 4327 } 4328 if (! KMP_CPU_ISSET(proc, fullMask)) { 4329 return -2; 4330 } 4331 4332 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4333 return 0; 4334 } 4335 4336 4337 int 4338 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4339 { 4340 int retval; 4341 4342 if (! 

int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - the "big" cores
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }
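
        // Worked example (illustrative values): nthreads = 6, ncores = 4,
        // 2 hardware threads per core => chunk = 1, big_cores = 2, big_nth = 4.
        // tids 0..3 go to the "big" cores: tid 0 -> core 0/thread 0,
        // tid 1 -> core 0/thread 1, tid 2 -> core 1/thread 0,
        // tid 3 -> core 1/thread 1; tids 4 and 5 take the else branch:
        // tid 4 -> core 2/thread 0, tid 5 -> core 3/thread 0.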

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them, and since address2os is sorted we can stop
                // as soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads per each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
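
            // Note (illustrative): newarr[ctx] now holds the number of threads
            // assigned to hardware context ctx.  The prefix-sum scan below maps
            // this thread's tid to the first context whose running total
            // exceeds tid; e.g. (assumed values) newarr = { 2, 1, 1, ... }
            // sends tids 0 and 1 to context 0, tid 2 to context 1 and tid 3
            // to context 2.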
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#else
    // affinity not supported

kmp_uint32 mac_skipPerLevel[7];
kmp_uint32 mac_depth;
kmp_uint8 mac_leaf_kids;
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    thr_bar->skip_per_level = mac_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED