1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_i18n.h" 18 #include "kmp_io.h" 19 #include "kmp_str.h" 20 #include "kmp_wrapper_getpid.h" 21 22 #if KMP_AFFINITY_SUPPORTED 23 24 // 25 // Print the affinity mask to the character array in a pretty format. 26 // 27 char * 28 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) 29 { 30 KMP_ASSERT(buf_len >= 40); 31 char *scan = buf; 32 char *end = buf + buf_len - 1; 33 34 // 35 // Find first element / check for empty set. 36 // 37 size_t i; 38 for (i = 0; i < KMP_CPU_SETSIZE; i++) { 39 if (KMP_CPU_ISSET(i, mask)) { 40 break; 41 } 42 } 43 if (i == KMP_CPU_SETSIZE) { 44 KMP_SNPRINTF(scan, buf_len, "{<empty>}"); 45 while (*scan != '\0') scan++; 46 KMP_ASSERT(scan <= end); 47 return buf; 48 } 49 50 KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i); 51 while (*scan != '\0') scan++; 52 i++; 53 for (; i < KMP_CPU_SETSIZE; i++) { 54 if (! KMP_CPU_ISSET(i, mask)) { 55 continue; 56 } 57 58 // 59 // Check for buffer overflow. A string of the form ",<n>" will have 60 // at most 10 characters, plus we want to leave room to print ",...}" 61 // if the set is too large to print for a total of 15 characters. 62 // We already left room for '\0' in setting end. 
63 // 64 if (end - scan < 15) { 65 break; 66 } 67 KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i); 68 while (*scan != '\0') scan++; 69 } 70 if (i < KMP_CPU_SETSIZE) { 71 KMP_SNPRINTF(scan, buf_len, ",..."); 72 while (*scan != '\0') scan++; 73 } 74 KMP_SNPRINTF(scan, buf_len, "}"); 75 while (*scan != '\0') scan++; 76 KMP_ASSERT(scan <= end); 77 return buf; 78 } 79 80 81 void 82 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) 83 { 84 KMP_CPU_ZERO(mask); 85 86 # if KMP_GROUP_AFFINITY 87 88 if (__kmp_num_proc_groups > 1) { 89 int group; 90 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 91 for (group = 0; group < __kmp_num_proc_groups; group++) { 92 int i; 93 int num = __kmp_GetActiveProcessorCount(group); 94 for (i = 0; i < num; i++) { 95 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 96 } 97 } 98 } 99 else 100 101 # endif /* KMP_GROUP_AFFINITY */ 102 103 { 104 int proc; 105 for (proc = 0; proc < __kmp_xproc; proc++) { 106 KMP_CPU_SET(proc, mask); 107 } 108 } 109 } 110 111 112 // 113 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member 114 // functions. 115 // 116 // The icc codegen emits sections with extremely long names, of the form 117 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug 118 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving 119 // some sort of memory corruption or table overflow that is triggered by 120 // these long strings. I checked the latest version of the linker - 121 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not 122 // fixed. 123 // 124 // Unfortunately, my attempts to reproduce it in a smaller example have 125 // failed - I'm not sure what the prospects are of getting it fixed 126 // properly - but we need a reproducer smaller than all of libomp. 127 // 128 // Work around the problem by avoiding inline constructors in such builds. 
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

//
// A point in the machine topology tree.  labels[] holds the hardware ids
// along the path from the root (e.g. package id, core id, thread id);
// childNums[] holds the corresponding ordinal child numbers, filled in
// later by __kmp_affinity_assign_child_nums().
//
class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;   // number of valid entries in labels[] / childNums[]
    unsigned leader;  // reset to FALSE on construction and assignment
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        // Note: leader is deliberately NOT copied from b.
        leader = FALSE;
        return *this;
    }
    // Equality compares depth and labels only; childNums/leader ignored.
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if(labels[i] != b.labels[i])
                return false;
        return true;
    }
    // True if the two addresses agree on all labels above the bottom
    // 'level' levels, i.e. they share the same ancestor at that height.
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if(labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

// An Address paired with the OS proc id it describes.
class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

//
// Out-of-line variants of the same two classes.  See the linker-bug
// comment above for why inline member functions are avoided in
// debug / cover builds.
//
class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    // leader is deliberately not copied.
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if(labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if(labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


//
// qsort comparator: order AddrUnsPairs lexicographically by their raw
// hardware labels.  Both elements must have the same depth (asserted).
//
static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


//
// qsort comparator used for compact/scatter placement: compares ordinal
// child numbers, giving the innermost __kmp_affinity_compact levels the
// highest significance, then the remaining levels outermost-first.
//
static
int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)
      ->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)
      ->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}

/** A structure for holding machine-specific hierarchy info to be computed once at init.
    This structure represents a mapping of threads to the actual machine hierarchy, or to
    our best guess at what the hierarchy might be, for the purpose of performing an
    efficient barrier.  In the worst case, when there is no machine hierarchy information,
    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
    /** Number of levels in the hierarchy.  Typical levels are threads/core, cores/package
        or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
        nomenclature.  When the machine is oversubscribed we add levels to duplicate the
        hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
    kmp_uint32 maxLevels;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf.  It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
    volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

    /** Level 0 corresponds to leaves.  numPerLevel[i] is the number of children the parent of a
        node at level i has.  For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}.  All empty levels are set to 1. */
    kmp_uint32 *numPerLevel;
    // skipPerLevel[i] = number of leaves under one node at level i
    // (cumulative product of numPerLevel[0..i-1]); both arrays share one
    // allocation of size maxLevels*2, numPerLevel first.
    kmp_uint32 *skipPerLevel;

    // Derive numPerLevel[] from a label-sorted address table: level k's
    // fan-out is (max childNum observed at that level) + 1.  Levels are
    // filled leaf-first, i.e. reversed relative to the label order.
    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}

    // TO FIX: This destructor causes a segfault in the library at shutdown.
    //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }

    // One-time initialization; safe to call from multiple threads.  The CAS
    // on 'uninitialized' (1 -> 2) elects a single initializing thread;
    // losers spin until it publishes 'uninitialized = 0'.
    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the data fields here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        resizing = 0;
        maxLevels = 7;
        // Single allocation holding both arrays; skipPerLevel aliases the
        // second half.
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            // No topology information: guess a 2-level tree with fan-out 4
            // at the leaves.
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        // Rebalance: halve any over-wide level, pushing the excess up one
        // level (doubling it), growing depth as levels become non-trivial.
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++; // round odd widths up before halving
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        // skipPerLevel[i] = leaves per node at level i (running product).
        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        uninitialized = 0; // One writer

    }

    // Grow the hierarchy to accommodate nproc > base_num_threads threads.
    // The CAS on 'resizing' elects a single resizing thread; losers spin.
    void resize(kmp_uint32 nproc)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        if (bool_result == 0) { // Someone else is resizing
            while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result!=0);
        KMP_DEBUG_ASSERT(nproc > base_num_threads);

        // Calculate new max_levels: one extra doubling level per factor of
        // two of oversubscription.
        kmp_uint32 old_sz = skipPerLevel[depth-1];
        kmp_uint32 incs = 0, old_maxLevels= maxLevels;
        while (nproc > old_sz) {
            old_sz *=2;
            incs++;
        }
        maxLevels += incs;

        // Resize arrays
        kmp_uint32 *old_numPerLevel = numPerLevel;
        kmp_uint32 *old_skipPerLevel = skipPerLevel;
        numPerLevel = skipPerLevel = NULL;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);

        // Copy old elements from old arrays
        for (kmp_uint32 i=0; i<old_maxLevels; ++i) {
            numPerLevel[i] = old_numPerLevel[i];
            skipPerLevel[i] = old_skipPerLevel[i];
        }

        // Init new elements in arrays to 1
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) {
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Free old arrays (old_skipPerLevel aliased the same allocation).
        __kmp_free(old_numPerLevel);

        // Fill in oversubscription levels of hierarchy
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        base_num_threads = nproc;
        resizing = 0; // One writer

    }
};

// Process-wide singleton; guarded by the flags inside hierarchy_info.
static hierarchy_info machine_hierarchy;

// Fill in the barrier-hierarchy fields of thr_bar for a team of nproc
// threads, lazily initializing / resizing the shared machine_hierarchy.
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".  Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);
    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // The loop below adjusts the depth in the case of a resize
    while (nproc > machine_hierarchy.skipPerLevel[depth-1])
        depth++;

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example: suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
510 // 511 static void 512 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 513 int numAddrs) 514 { 515 KMP_DEBUG_ASSERT(numAddrs > 0); 516 int depth = address2os->first.depth; 517 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 518 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth 519 * sizeof(unsigned)); 520 int labCt; 521 for (labCt = 0; labCt < depth; labCt++) { 522 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 523 lastLabel[labCt] = address2os[0].first.labels[labCt]; 524 } 525 int i; 526 for (i = 1; i < numAddrs; i++) { 527 for (labCt = 0; labCt < depth; labCt++) { 528 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 529 int labCt2; 530 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 531 counts[labCt2] = 0; 532 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 533 } 534 counts[labCt]++; 535 lastLabel[labCt] = address2os[i].first.labels[labCt]; 536 break; 537 } 538 } 539 for (labCt = 0; labCt < depth; labCt++) { 540 address2os[i].first.childNums[labCt] = counts[labCt]; 541 } 542 for (; labCt < (int)Address::maxDepth; labCt++) { 543 address2os[i].first.childNums[labCt] = 0; 544 } 545 } 546 } 547 548 549 // 550 // All of the __kmp_affinity_create_*_map() routines should set 551 // __kmp_affinity_masks to a vector of affinity mask objects of length 552 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and 553 // return the number of levels in the machine topology tree (zero if 554 // __kmp_affinity_type == affinity_none). 555 // 556 // All of the __kmp_affinity_create_*_map() routines should set *fullMask 557 // to the affinity mask for the initialization thread. They need to save and 558 // restore the mask, and it could be needed later, so saving it is just an 559 // optimization to avoid calling kmp_get_system_affinity() again. 
//
// Affinity mask of the initial thread, saved at initialization time by the
// __kmp_affinity_create_*_map() routines (see comment above).
static kmp_affin_mask_t *fullMask = NULL;

// Read-only accessor for the saved startup mask.
kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


// Machine shape as discovered by the topology-detection routines below.
static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    // Uniform iff every package has the same number of cores and every
    // core the same number of threads.
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc: one KMP_INFORM line per proc, labeling each topology
// level as Thread / Core / Package / Node_<k> (or L<level> if unknown).
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                // Levels above the package are numbered node levels.
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                // Level with no known name - print its index.
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
// Returns the number of topology levels modeled (1), or 0 when affinity is
// disabled (only the global shape variables are set in that case).
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        // Model each proc as its own single-core package.
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned: one depth-1 Address
    // per available proc, labeled with the OS proc id itself.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
// Returns 2 (levels modeled) on success, or -1 when the flat mapping
// should be used instead.
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned: for each available proc,
    // a depth-2 Address of (group id, proc-within-group).
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        // Each group spans one DWORD_PTR worth of bits in the mask.
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            // NOTE(review): gran_str is computed but never consumed here -
            // presumably it was meant to feed the warning message above
            // (e.g. a KMP_WARNING call); verify against upstream.
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

// Smallest r such that (1 << r) >= count, i.e. the bit width needed to
// encode 'count' distinct values (used to decode Apic Id fields).
static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}


// Per-OS-proc record gathered by binding to the proc and issuing cpuid.
class apicThreadInfo {
public:
    unsigned osId;             // param to __kmp_affinity_bind_thread
    unsigned apicId;           // from cpuid after binding
    unsigned maxCoresPerPkg;   // ""
    unsigned maxThreadsPerPkg; // ""
    unsigned pkgId;            // inferred from above values
    unsigned coreId;           // ""
    unsigned threadId;         // ""
};


// qsort comparator: order apicThreadInfo records by OS proc id.
static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


// qsort comparator: order by physical location (package, core, thread).
static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
865 // 866 static int 867 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 868 kmp_i18n_id_t *const msg_id) 869 { 870 kmp_cpuid buf; 871 int rc; 872 *address2os = NULL; 873 *msg_id = kmp_i18n_null; 874 875 // 876 // Check if cpuid leaf 4 is supported. 877 // 878 __kmp_x86_cpuid(0, 0, &buf); 879 if (buf.eax < 4) { 880 *msg_id = kmp_i18n_str_NoLeaf4Support; 881 return -1; 882 } 883 884 // 885 // The algorithm used starts by setting the affinity to each available 886 // thread and retrieving info from the cpuid instruction, so if we are 887 // not capable of calling __kmp_get_system_affinity() and 888 // _kmp_get_system_affinity(), then we need to do something else - use 889 // the defaults that we calculated from issuing cpuid without binding 890 // to each proc. 891 // 892 if (! KMP_AFFINITY_CAPABLE()) { 893 // 894 // Hack to try and infer the machine topology using only the data 895 // available from cpuid on the current thread, and __kmp_xproc. 896 // 897 KMP_ASSERT(__kmp_affinity_type == affinity_none); 898 899 // 900 // Get an upper bound on the number of threads per package using 901 // cpuid(1). 902 // 903 // On some OS/chps combinations where HT is supported by the chip 904 // but is disabled, this value will be 2 on a single core chip. 905 // Usually, it will be 2 if HT is enabled and 1 if HT is disabled. 906 // 907 __kmp_x86_cpuid(1, 0, &buf); 908 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 909 if (maxThreadsPerPkg == 0) { 910 maxThreadsPerPkg = 1; 911 } 912 913 // 914 // The num cores per pkg comes from cpuid(4). 915 // 1 must be added to the encoded value. 916 // 917 // The author of cpu_count.cpp treated this only an upper bound 918 // on the number of cores, but I haven't seen any cases where it 919 // was greater than the actual number of cores, so we will treat 920 // it as exact in this block of code. 921 // 922 // First, we need to check if cpuid(4) is supported on this chip. 
923 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 924 // has the value n or greater. 925 // 926 __kmp_x86_cpuid(0, 0, &buf); 927 if (buf.eax >= 4) { 928 __kmp_x86_cpuid(4, 0, &buf); 929 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 930 } 931 else { 932 nCoresPerPkg = 1; 933 } 934 935 // 936 // There is no way to reliably tell if HT is enabled without issuing 937 // the cpuid instruction from every thread, can correlating the cpuid 938 // info, so if the machine is not affinity capable, we assume that HT 939 // is off. We have seen quite a few machines where maxThreadsPerPkg 940 // is 2, yet the machine does not support HT. 941 // 942 // - Older OSes are usually found on machines with older chips, which 943 // do not support HT. 944 // 945 // - The performance penalty for mistakenly identifying a machine as 946 // HT when it isn't (which results in blocktime being incorrecly set 947 // to 0) is greater than the penalty when for mistakenly identifying 948 // a machine as being 1 thread/core when it is really HT enabled 949 // (which results in blocktime being incorrectly set to a positive 950 // value). 951 // 952 __kmp_ncores = __kmp_xproc; 953 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 954 __kmp_nThreadsPerCore = 1; 955 if (__kmp_affinity_verbose) { 956 KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY"); 957 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 958 if (__kmp_affinity_uniform_topology()) { 959 KMP_INFORM(Uniform, "KMP_AFFINITY"); 960 } else { 961 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 962 } 963 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 964 __kmp_nThreadsPerCore, __kmp_ncores); 965 } 966 return 0; 967 } 968 969 // 970 // 971 // From here on, we can assume that it is safe to call 972 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), 973 // even if __kmp_affinity_type = affinity_none. 974 // 975 976 // 977 // Save the affinity mask for the current thread. 
978 // 979 kmp_affin_mask_t *oldMask; 980 KMP_CPU_ALLOC(oldMask); 981 KMP_ASSERT(oldMask != NULL); 982 __kmp_get_system_affinity(oldMask, TRUE); 983 984 // 985 // Run through each of the available contexts, binding the current thread 986 // to it, and obtaining the pertinent information using the cpuid instr. 987 // 988 // The relevant information is: 989 // 990 // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context 991 // has a uniqie Apic Id, which is of the form pkg# : core# : thread#. 992 // 993 // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The 994 // value of this field determines the width of the core# + thread# 995 // fields in the Apic Id. It is also an upper bound on the number 996 // of threads per package, but it has been verified that situations 997 // happen were it is not exact. In particular, on certain OS/chip 998 // combinations where Intel(R) Hyper-Threading Technology is supported 999 // by the chip but has 1000 // been disabled, the value of this field will be 2 (for a single core 1001 // chip). On other OS/chip combinations supporting 1002 // Intel(R) Hyper-Threading Technology, the value of 1003 // this field will be 1 when Intel(R) Hyper-Threading Technology is 1004 // disabled and 2 when it is enabled. 1005 // 1006 // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The 1007 // value of this field (+1) determines the width of the core# field in 1008 // the Apic Id. The comments in "cpucount.cpp" say that this value is 1009 // an upper bound, but the IA-32 architecture manual says that it is 1010 // exactly the number of cores per package, and I haven't seen any 1011 // case where it wasn't. 1012 // 1013 // From this information, deduce the package Id, core Id, and thread Id, 1014 // and set the corresponding fields in the apicThreadInfo struct. 
1015 // 1016 unsigned i; 1017 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( 1018 __kmp_avail_proc * sizeof(apicThreadInfo)); 1019 unsigned nApics = 0; 1020 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 1021 // 1022 // Skip this proc if it is not included in the machine model. 1023 // 1024 if (! KMP_CPU_ISSET(i, fullMask)) { 1025 continue; 1026 } 1027 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 1028 1029 __kmp_affinity_bind_thread(i); 1030 threadInfo[nApics].osId = i; 1031 1032 // 1033 // The apic id and max threads per pkg come from cpuid(1). 1034 // 1035 __kmp_x86_cpuid(1, 0, &buf); 1036 if (! (buf.edx >> 9) & 1) { 1037 __kmp_set_system_affinity(oldMask, TRUE); 1038 __kmp_free(threadInfo); 1039 KMP_CPU_FREE(oldMask); 1040 *msg_id = kmp_i18n_str_ApicNotPresent; 1041 return -1; 1042 } 1043 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 1044 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 1045 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 1046 threadInfo[nApics].maxThreadsPerPkg = 1; 1047 } 1048 1049 // 1050 // Max cores per pkg comes from cpuid(4). 1051 // 1 must be added to the encoded value. 1052 // 1053 // First, we need to check if cpuid(4) is supported on this chip. 1054 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 1055 // has the value n or greater. 1056 // 1057 __kmp_x86_cpuid(0, 0, &buf); 1058 if (buf.eax >= 4) { 1059 __kmp_x86_cpuid(4, 0, &buf); 1060 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 1061 } 1062 else { 1063 threadInfo[nApics].maxCoresPerPkg = 1; 1064 } 1065 1066 // 1067 // Infer the pkgId / coreId / threadId using only the info 1068 // obtained locally. 
1069 // 1070 int widthCT = __kmp_cpuid_mask_width( 1071 threadInfo[nApics].maxThreadsPerPkg); 1072 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 1073 1074 int widthC = __kmp_cpuid_mask_width( 1075 threadInfo[nApics].maxCoresPerPkg); 1076 int widthT = widthCT - widthC; 1077 if (widthT < 0) { 1078 // 1079 // I've never seen this one happen, but I suppose it could, if 1080 // the cpuid instruction on a chip was really screwed up. 1081 // Make sure to restore the affinity mask before the tail call. 1082 // 1083 __kmp_set_system_affinity(oldMask, TRUE); 1084 __kmp_free(threadInfo); 1085 KMP_CPU_FREE(oldMask); 1086 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1087 return -1; 1088 } 1089 1090 int maskC = (1 << widthC) - 1; 1091 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) 1092 &maskC; 1093 1094 int maskT = (1 << widthT) - 1; 1095 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT; 1096 1097 nApics++; 1098 } 1099 1100 // 1101 // We've collected all the info we need. 1102 // Restore the old affinity mask for this thread. 1103 // 1104 __kmp_set_system_affinity(oldMask, TRUE); 1105 1106 // 1107 // If there's only one thread context to bind to, form an Address object 1108 // with depth 1 and return immediately (or, if affinity is off, set 1109 // address2os to NULL and return). 1110 // 1111 // If it is configured to omit the package level when there is only a 1112 // single package, the logic at the end of this routine won't work if 1113 // there is only a single thread - it would try to form an Address 1114 // object with depth 0. 
1115 // 1116 KMP_ASSERT(nApics > 0); 1117 if (nApics == 1) { 1118 __kmp_ncores = nPackages = 1; 1119 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1120 if (__kmp_affinity_verbose) { 1121 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1122 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1123 1124 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1125 if (__kmp_affinity_respect_mask) { 1126 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1127 } else { 1128 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1129 } 1130 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1131 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1132 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1133 __kmp_nThreadsPerCore, __kmp_ncores); 1134 } 1135 1136 if (__kmp_affinity_type == affinity_none) { 1137 __kmp_free(threadInfo); 1138 KMP_CPU_FREE(oldMask); 1139 return 0; 1140 } 1141 1142 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1143 Address addr(1); 1144 addr.labels[0] = threadInfo[0].pkgId; 1145 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1146 1147 if (__kmp_affinity_gran_levels < 0) { 1148 __kmp_affinity_gran_levels = 0; 1149 } 1150 1151 if (__kmp_affinity_verbose) { 1152 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1153 } 1154 1155 __kmp_free(threadInfo); 1156 KMP_CPU_FREE(oldMask); 1157 return 1; 1158 } 1159 1160 // 1161 // Sort the threadInfo table by physical Id. 1162 // 1163 qsort(threadInfo, nApics, sizeof(*threadInfo), 1164 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1165 1166 // 1167 // The table is now sorted by pkgId / coreId / threadId, but we really 1168 // don't know the radix of any of the fields. pkgId's may be sparsely 1169 // assigned among the chips on a system. Although coreId's are usually 1170 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1171 // [0..threadsPerCore-1], we don't want to make any such assumptions. 
1172 // 1173 // For that matter, we don't know what coresPerPkg and threadsPerCore 1174 // (or the total # packages) are at this point - we want to determine 1175 // that now. We only have an upper bound on the first two figures. 1176 // 1177 // We also perform a consistency check at this point: the values returned 1178 // by the cpuid instruction for any thread bound to a given package had 1179 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1180 // 1181 nPackages = 1; 1182 nCoresPerPkg = 1; 1183 __kmp_nThreadsPerCore = 1; 1184 unsigned nCores = 1; 1185 1186 unsigned pkgCt = 1; // to determine radii 1187 unsigned lastPkgId = threadInfo[0].pkgId; 1188 unsigned coreCt = 1; 1189 unsigned lastCoreId = threadInfo[0].coreId; 1190 unsigned threadCt = 1; 1191 unsigned lastThreadId = threadInfo[0].threadId; 1192 1193 // intra-pkg consist checks 1194 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1195 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1196 1197 for (i = 1; i < nApics; i++) { 1198 if (threadInfo[i].pkgId != lastPkgId) { 1199 nCores++; 1200 pkgCt++; 1201 lastPkgId = threadInfo[i].pkgId; 1202 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1203 coreCt = 1; 1204 lastCoreId = threadInfo[i].coreId; 1205 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1206 threadCt = 1; 1207 lastThreadId = threadInfo[i].threadId; 1208 1209 // 1210 // This is a different package, so go on to the next iteration 1211 // without doing any consistency checks. Reset the consistency 1212 // check vars, though. 
1213 // 1214 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1215 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1216 continue; 1217 } 1218 1219 if (threadInfo[i].coreId != lastCoreId) { 1220 nCores++; 1221 coreCt++; 1222 lastCoreId = threadInfo[i].coreId; 1223 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1224 threadCt = 1; 1225 lastThreadId = threadInfo[i].threadId; 1226 } 1227 else if (threadInfo[i].threadId != lastThreadId) { 1228 threadCt++; 1229 lastThreadId = threadInfo[i].threadId; 1230 } 1231 else { 1232 __kmp_free(threadInfo); 1233 KMP_CPU_FREE(oldMask); 1234 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1235 return -1; 1236 } 1237 1238 // 1239 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1240 // fields agree between all the threads bounds to a given package. 1241 // 1242 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) 1243 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1244 __kmp_free(threadInfo); 1245 KMP_CPU_FREE(oldMask); 1246 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1247 return -1; 1248 } 1249 } 1250 nPackages = pkgCt; 1251 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1252 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1253 1254 // 1255 // When affinity is off, this routine will still be called to set 1256 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1257 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1258 // correctly, and return now if affinity is not enabled. 
1259 // 1260 __kmp_ncores = nCores; 1261 if (__kmp_affinity_verbose) { 1262 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1263 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1264 1265 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1266 if (__kmp_affinity_respect_mask) { 1267 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1268 } else { 1269 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1270 } 1271 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1272 if (__kmp_affinity_uniform_topology()) { 1273 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1274 } else { 1275 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1276 } 1277 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1278 __kmp_nThreadsPerCore, __kmp_ncores); 1279 1280 } 1281 1282 if (__kmp_affinity_type == affinity_none) { 1283 __kmp_free(threadInfo); 1284 KMP_CPU_FREE(oldMask); 1285 return 0; 1286 } 1287 1288 // 1289 // Now that we've determined the number of packages, the number of cores 1290 // per package, and the number of threads per core, we can construct the 1291 // data structure that is to be returned. 1292 // 1293 int pkgLevel = 0; 1294 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1295 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1);
    // (NOTE(review): this span opens mid-function — the lines above complete
    // the tail of __kmp_affinity_create_apicid_map, whose head is earlier in
    // the file. Code below is unchanged; only comments are edited.)

    //
    // depth counts how many of pkg / core / thread levels are modeled.
    //
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    //
    // Build one Address per available proc, with labels ordered from the
    // coarsest modeled level (package) to the finest (thread).
    //
    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11 (extended topology enumeration).
//
// Builds the machine topology map in *address2os (one AddrUnsPair per
// available proc, sorted by topology labels) and sets the globals
// __kmp_ncores, __kmp_nThreadsPerCore, nCoresPerPkg, and nPackages.
//
// Returns the depth of the resulting map (>= 1), 0 when only defaults could
// be established (affinity off / single proc), or -1 on error with *msg_id
// set to the i18n id describing the failure.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported: leaf 0 reports the
    // highest supported leaf in eax, and a supported leaf 11 must report a
    // nonzero logical-processor count in ebx.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology. While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
    // try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            // No more levels reported by the hardware. If no package level
            // was seen, treat this invalid level as the package level (its
            // width covers all remaining APIC id bits).
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        // Level type is reported in bits 15:8 of ecx (1 = SMT, 2 = core).
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            // Any other level kind: the first such level above level 0 is
            // taken as the package level; further ones are ignored.
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest. The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    // NOTE(review): unlike the apicid path above, there is no
    // KMP_ASSERT(oldMask != NULL) after KMP_CPU_ALLOC here — consider adding
    // one for consistency.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID. Leaf 11 reports the full x2APIC id in edx and
        // the bit width of each sub-id in eax[4:0]; the label for a level is
        // the slice of the APIC id between the previous level's shift and
        // this level's shift.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                // The enumeration ended early; that is only consistent if
                // this is the last (package) level, whose label is all the
                // remaining high bits.
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        // Each proc must report exactly the same number of levels that was
        // detected up front.
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels: totals[] counts distinct label
    // prefixes, counts[] is the running count within the current parent,
    // maxCt[] the maximum such count, last[] the previous proc's labels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // Resetting maxCt[j] here caused incorrect topology
                    // information to be printed whenever the max value for a
                    // level (maxCt[level]) was encountered before a smaller
                    // value later in the array. For example, if pkg0 has 4
                    // cores and pkg1 has 2 cores, maxCt[1] would end up 2
                    // instead of 4. TODO!!! Check if it can be commented
                    // safely.
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                // Two procs reported identical label tuples - the x2APIC
                // ids are not unique, so the data cannot be trusted.
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform: the product of the
    // per-level maxima must equal the total number of leaf entries.
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        int newPkgLevel = -1;
        int newCoreLevel = -1;
        int newThreadLevel = -1;
        int i;   // NOTE(review): unused local, kept verbatim.
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1)
              && (level != pkgLevel)) {
                //
                // Remove this level. Never remove the package level
                //
                continue;
            }
            if (level == pkgLevel) {
                newPkgLevel = level;
            }
            if (level == coreLevel) {
                newCoreLevel = level;
            }
            if (level == threadLevel) {
                newThreadLevel = level;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
        pkgLevel = newPkgLevel;
        coreLevel = newCoreLevel;
        threadLevel = newThreadLevel;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


//
// Indices into a /proc/cpuinfo-derived per-proc record (see the cpuinfo
// parser below). Records are arrays of unsigned of length maxIndex + 1;
// node ids, when present, occupy slots nodeIdIndex and up.
//
#define osIdIndex 0
#define threadIdIndex 1
#define coreIdIndex 2
#define pkgIdIndex 3
#define nodeIdIndex 4

typedef unsigned *ProcCpuInfo;
// Highest valid index in a record; grows when node_<n> fields are found.
static unsigned maxIndex = pkgIdIndex;


//
// qsort comparator: order records by OS proc id only.
//
// NOTE(review): this treats each element as a record (const unsigned *),
// while __kmp_affinity_cmp_ProcCpuInfo_phys_id below treats each element as
// a pointer to a record (const unsigned **). Confirm against the arrays each
// one is used to sort - the callers are not visible from here. Also note the
// stray ';' after the closing brace, kept verbatim.
//
static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
};


//
// qsort comparator: order record pointers by physical topology, comparing
// fields from the coarsest (maxIndex, i.e. node/pkg) down to osIdIndex.
// The explicit break at osIdIndex avoids decrementing the unsigned index
// below zero.
//
static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    unsigned i;
    const unsigned *aa = *((const unsigned **)a);
    const unsigned *bb = *((const unsigned **)b);
    for (i = maxIndex; ; i--) {
        if (aa[i] < bb[i]) return -1;
        if (aa[i] > bb[i]) return 1;
        if (i == osIdIndex) break;
    }
    return 0;
}


//
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
//
1904 // 1905 static int 1906 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1907 kmp_i18n_id_t *const msg_id, FILE *f) 1908 { 1909 *address2os = NULL; 1910 *msg_id = kmp_i18n_null; 1911 1912 // 1913 // Scan of the file, and count the number of "processor" (osId) fields, 1914 // and find the highest value of <n> for a node_<n> field. 1915 // 1916 char buf[256]; 1917 unsigned num_records = 0; 1918 while (! feof(f)) { 1919 buf[sizeof(buf) - 1] = 1; 1920 if (! fgets(buf, sizeof(buf), f)) { 1921 // 1922 // Read errors presumably because of EOF 1923 // 1924 break; 1925 } 1926 1927 char s1[] = "processor"; 1928 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1929 num_records++; 1930 continue; 1931 } 1932 1933 // 1934 // FIXME - this will match "node_<n> <garbage>" 1935 // 1936 unsigned level; 1937 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1938 if (nodeIdIndex + level >= maxIndex) { 1939 maxIndex = nodeIdIndex + level; 1940 } 1941 continue; 1942 } 1943 } 1944 1945 // 1946 // Check for empty file / no valid processor records, or too many. 1947 // The number of records can't exceed the number of valid bits in the 1948 // affinity mask. 1949 // 1950 if (num_records == 0) { 1951 *line = 0; 1952 *msg_id = kmp_i18n_str_NoProcRecords; 1953 return -1; 1954 } 1955 if (num_records > (unsigned)__kmp_xproc) { 1956 *line = 0; 1957 *msg_id = kmp_i18n_str_TooManyProcRecords; 1958 return -1; 1959 } 1960 1961 // 1962 // Set the file pointer back to the begginning, so that we can scan the 1963 // file again, this time performing a full parse of the data. 1964 // Allocate a vector of ProcCpuInfo object, where we will place the data. 1965 // Adding an extra element at the end allows us to remove a lot of extra 1966 // checks for termination conditions. 1967 // 1968 if (fseek(f, 0, SEEK_SET) != 0) { 1969 *line = 0; 1970 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1971 return -1; 1972 } 1973 1974 // 1975 // Allocate the array of records to store the proc info in. 
The dummy 1976 // element at the end makes the logic in filling them out easier to code. 1977 // 1978 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1979 * sizeof(unsigned *)); 1980 unsigned i; 1981 for (i = 0; i <= num_records; i++) { 1982 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1983 * sizeof(unsigned)); 1984 } 1985 1986 #define CLEANUP_THREAD_INFO \ 1987 for (i = 0; i <= num_records; i++) { \ 1988 __kmp_free(threadInfo[i]); \ 1989 } \ 1990 __kmp_free(threadInfo); 1991 1992 // 1993 // A value of UINT_MAX means that we didn't find the field 1994 // 1995 unsigned __index; 1996 1997 #define INIT_PROC_INFO(p) \ 1998 for (__index = 0; __index <= maxIndex; __index++) { \ 1999 (p)[__index] = UINT_MAX; \ 2000 } 2001 2002 for (i = 0; i <= num_records; i++) { 2003 INIT_PROC_INFO(threadInfo[i]); 2004 } 2005 2006 unsigned num_avail = 0; 2007 *line = 0; 2008 while (! feof(f)) { 2009 // 2010 // Create an inner scoping level, so that all the goto targets at the 2011 // end of the loop appear in an outer scoping level. This avoids 2012 // warnings about jumping past an initialization to a target in the 2013 // same block. 2014 // 2015 { 2016 buf[sizeof(buf) - 1] = 1; 2017 bool long_line = false; 2018 if (! fgets(buf, sizeof(buf), f)) { 2019 // 2020 // Read errors presumably because of EOF 2021 // 2022 // If there is valid data in threadInfo[num_avail], then fake 2023 // a blank line in ensure that the last address gets parsed. 2024 // 2025 bool valid = false; 2026 for (i = 0; i <= maxIndex; i++) { 2027 if (threadInfo[num_avail][i] != UINT_MAX) { 2028 valid = true; 2029 } 2030 } 2031 if (! valid) { 2032 break; 2033 } 2034 buf[0] = 0; 2035 } else if (!buf[sizeof(buf) - 1]) { 2036 // 2037 // The line is longer than the buffer. Set a flag and don't 2038 // emit an error if we were going to ignore the line, anyway. 
2039 // 2040 long_line = true; 2041 2042 #define CHECK_LINE \ 2043 if (long_line) { \ 2044 CLEANUP_THREAD_INFO; \ 2045 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2046 return -1; \ 2047 } 2048 } 2049 (*line)++; 2050 2051 char s1[] = "processor"; 2052 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2053 CHECK_LINE; 2054 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2055 unsigned val; 2056 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2057 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 2058 threadInfo[num_avail][osIdIndex] = val; 2059 #if KMP_OS_LINUX && USE_SYSFS_INFO 2060 char path[256]; 2061 KMP_SNPRINTF(path, sizeof(path), 2062 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2063 threadInfo[num_avail][osIdIndex]); 2064 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2065 2066 KMP_SNPRINTF(path, sizeof(path), 2067 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2068 threadInfo[num_avail][osIdIndex]); 2069 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2070 continue; 2071 #else 2072 } 2073 char s2[] = "physical id"; 2074 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2075 CHECK_LINE; 2076 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2077 unsigned val; 2078 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2079 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2080 threadInfo[num_avail][pkgIdIndex] = val; 2081 continue; 2082 } 2083 char s3[] = "core id"; 2084 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2085 CHECK_LINE; 2086 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2087 unsigned val; 2088 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2089 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2090 threadInfo[num_avail][coreIdIndex] = val; 2091 continue; 2092 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2093 } 2094 char s4[] = "thread id"; 2095 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2096 
CHECK_LINE; 2097 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2098 unsigned val; 2099 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2100 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2101 threadInfo[num_avail][threadIdIndex] = val; 2102 continue; 2103 } 2104 unsigned level; 2105 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2106 CHECK_LINE; 2107 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2108 unsigned val; 2109 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2110 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2111 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2112 threadInfo[num_avail][nodeIdIndex + level] = val; 2113 continue; 2114 } 2115 2116 // 2117 // We didn't recognize the leading token on the line. 2118 // There are lots of leading tokens that we don't recognize - 2119 // if the line isn't empty, go on to the next line. 2120 // 2121 if ((*buf != 0) && (*buf != '\n')) { 2122 // 2123 // If the line is longer than the buffer, read characters 2124 // until we find a newline. 2125 // 2126 if (long_line) { 2127 int ch; 2128 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2129 } 2130 continue; 2131 } 2132 2133 // 2134 // A newline has signalled the end of the processor record. 2135 // Check that there aren't too many procs specified. 2136 // 2137 if ((int)num_avail == __kmp_xproc) { 2138 CLEANUP_THREAD_INFO; 2139 *msg_id = kmp_i18n_str_TooManyEntries; 2140 return -1; 2141 } 2142 2143 // 2144 // Check for missing fields. The osId field must be there, and we 2145 // currently require that the physical id field is specified, also. 
2146 // 2147 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2148 CLEANUP_THREAD_INFO; 2149 *msg_id = kmp_i18n_str_MissingProcField; 2150 return -1; 2151 } 2152 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2153 CLEANUP_THREAD_INFO; 2154 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2155 return -1; 2156 } 2157 2158 // 2159 // Skip this proc if it is not included in the machine model. 2160 // 2161 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2162 INIT_PROC_INFO(threadInfo[num_avail]); 2163 continue; 2164 } 2165 2166 // 2167 // We have a successful parse of this proc's info. 2168 // Increment the counter, and prepare for the next proc. 2169 // 2170 num_avail++; 2171 KMP_ASSERT(num_avail <= num_records); 2172 INIT_PROC_INFO(threadInfo[num_avail]); 2173 } 2174 continue; 2175 2176 no_val: 2177 CLEANUP_THREAD_INFO; 2178 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2179 return -1; 2180 2181 dup_field: 2182 CLEANUP_THREAD_INFO; 2183 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2184 return -1; 2185 } 2186 *line = 0; 2187 2188 # if KMP_MIC && REDUCE_TEAM_SIZE 2189 unsigned teamSize = 0; 2190 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2191 2192 // check for num_records == __kmp_xproc ??? 2193 2194 // 2195 // If there's only one thread context to bind to, form an Address object 2196 // with depth 1 and return immediately (or, if affinity is off, set 2197 // address2os to NULL and return). 2198 // 2199 // If it is configured to omit the package level when there is only a 2200 // single package, the logic at the end of this routine won't work if 2201 // there is only a single thread - it would try to form an Address 2202 // object with depth 0. 2203 // 2204 KMP_ASSERT(num_avail > 0); 2205 KMP_ASSERT(num_avail <= num_records); 2206 if (num_avail == 1) { 2207 __kmp_ncores = 1; 2208 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2209 if (__kmp_affinity_verbose) { 2210 if (! 
KMP_AFFINITY_CAPABLE()) { 2211 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2212 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2213 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2214 } 2215 else { 2216 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2217 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2218 fullMask); 2219 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2220 if (__kmp_affinity_respect_mask) { 2221 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2222 } else { 2223 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2224 } 2225 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2226 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2227 } 2228 int index; 2229 kmp_str_buf_t buf; 2230 __kmp_str_buf_init(&buf); 2231 __kmp_str_buf_print(&buf, "1"); 2232 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2233 __kmp_str_buf_print(&buf, " x 1"); 2234 } 2235 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2236 __kmp_str_buf_free(&buf); 2237 } 2238 2239 if (__kmp_affinity_type == affinity_none) { 2240 CLEANUP_THREAD_INFO; 2241 return 0; 2242 } 2243 2244 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2245 Address addr(1); 2246 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2247 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2248 2249 if (__kmp_affinity_gran_levels < 0) { 2250 __kmp_affinity_gran_levels = 0; 2251 } 2252 2253 if (__kmp_affinity_verbose) { 2254 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2255 } 2256 2257 CLEANUP_THREAD_INFO; 2258 return 1; 2259 } 2260 2261 // 2262 // Sort the threadInfo table by physical Id. 2263 // 2264 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2265 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2266 2267 // 2268 // The table is now sorted by pkgId / coreId / threadId, but we really 2269 // don't know the radix of any of the fields. pkgId's may be sparsely 2270 // assigned among the chips on a system. 
Although coreId's are usually 2271 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2272 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2273 // 2274 // For that matter, we don't know what coresPerPkg and threadsPerCore 2275 // (or the total # packages) are at this point - we want to determine 2276 // that now. We only have an upper bound on the first two figures. 2277 // 2278 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2279 * sizeof(unsigned)); 2280 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2281 * sizeof(unsigned)); 2282 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2283 * sizeof(unsigned)); 2284 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2285 * sizeof(unsigned)); 2286 2287 bool assign_thread_ids = false; 2288 unsigned threadIdCt; 2289 unsigned index; 2290 2291 restart_radix_check: 2292 threadIdCt = 0; 2293 2294 // 2295 // Initialize the counter arrays with data from threadInfo[0]. 2296 // 2297 if (assign_thread_ids) { 2298 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2299 threadInfo[0][threadIdIndex] = threadIdCt++; 2300 } 2301 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2302 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2303 } 2304 } 2305 for (index = 0; index <= maxIndex; index++) { 2306 counts[index] = 1; 2307 maxCt[index] = 1; 2308 totals[index] = 1; 2309 lastId[index] = threadInfo[0][index];; 2310 } 2311 2312 // 2313 // Run through the rest of the OS procs. 2314 // 2315 for (i = 1; i < num_avail; i++) { 2316 // 2317 // Find the most significant index whose id differs 2318 // from the id for the previous OS proc. 2319 // 2320 for (index = maxIndex; index >= threadIdIndex; index--) { 2321 if (assign_thread_ids && (index == threadIdIndex)) { 2322 // 2323 // Auto-assign the thread id field if it wasn't specified. 
2324 // 2325 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2326 threadInfo[i][threadIdIndex] = threadIdCt++; 2327 } 2328 2329 // 2330 // Aparrently the thread id field was specified for some 2331 // entries and not others. Start the thread id counter 2332 // off at the next higher thread id. 2333 // 2334 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2335 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2336 } 2337 } 2338 if (threadInfo[i][index] != lastId[index]) { 2339 // 2340 // Run through all indices which are less significant, 2341 // and reset the counts to 1. 2342 // 2343 // At all levels up to and including index, we need to 2344 // increment the totals and record the last id. 2345 // 2346 unsigned index2; 2347 for (index2 = threadIdIndex; index2 < index; index2++) { 2348 totals[index2]++; 2349 if (counts[index2] > maxCt[index2]) { 2350 maxCt[index2] = counts[index2]; 2351 } 2352 counts[index2] = 1; 2353 lastId[index2] = threadInfo[i][index2]; 2354 } 2355 counts[index]++; 2356 totals[index]++; 2357 lastId[index] = threadInfo[i][index]; 2358 2359 if (assign_thread_ids && (index > threadIdIndex)) { 2360 2361 # if KMP_MIC && REDUCE_TEAM_SIZE 2362 // 2363 // The default team size is the total #threads in the machine 2364 // minus 1 thread for every core that has 3 or more threads. 2365 // 2366 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2367 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2368 2369 // 2370 // Restart the thread counter, as we are on a new core. 2371 // 2372 threadIdCt = 0; 2373 2374 // 2375 // Auto-assign the thread id field if it wasn't specified. 2376 // 2377 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2378 threadInfo[i][threadIdIndex] = threadIdCt++; 2379 } 2380 2381 // 2382 // Aparrently the thread id field was specified for some 2383 // entries and not others. Start the thread id counter 2384 // off at the next higher thread id. 
2385 // 2386 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2387 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2388 } 2389 } 2390 break; 2391 } 2392 } 2393 if (index < threadIdIndex) { 2394 // 2395 // If thread ids were specified, it is an error if they are not 2396 // unique. Also, check that we waven't already restarted the 2397 // loop (to be safe - shouldn't need to). 2398 // 2399 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2400 || assign_thread_ids) { 2401 __kmp_free(lastId); 2402 __kmp_free(totals); 2403 __kmp_free(maxCt); 2404 __kmp_free(counts); 2405 CLEANUP_THREAD_INFO; 2406 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2407 return -1; 2408 } 2409 2410 // 2411 // If the thread ids were not specified and we see entries 2412 // entries that are duplicates, start the loop over and 2413 // assign the thread ids manually. 2414 // 2415 assign_thread_ids = true; 2416 goto restart_radix_check; 2417 } 2418 } 2419 2420 # if KMP_MIC && REDUCE_TEAM_SIZE 2421 // 2422 // The default team size is the total #threads in the machine 2423 // minus 1 thread for every core that has 3 or more threads. 2424 // 2425 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2426 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2427 2428 for (index = threadIdIndex; index <= maxIndex; index++) { 2429 if (counts[index] > maxCt[index]) { 2430 maxCt[index] = counts[index]; 2431 } 2432 } 2433 2434 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2435 nCoresPerPkg = maxCt[coreIdIndex]; 2436 nPackages = totals[pkgIdIndex]; 2437 2438 // 2439 // Check to see if the machine topology is uniform 2440 // 2441 unsigned prod = totals[maxIndex]; 2442 for (index = threadIdIndex; index < maxIndex; index++) { 2443 prod *= maxCt[index]; 2444 } 2445 bool uniform = (prod == totals[threadIdIndex]); 2446 2447 // 2448 // When affinity is off, this routine will still be called to set 2449 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2450 // nCoresPerPkg, & nPackages. 
Make sure all these vars are set 2451 // correctly, and return now if affinity is not enabled. 2452 // 2453 __kmp_ncores = totals[coreIdIndex]; 2454 2455 if (__kmp_affinity_verbose) { 2456 if (! KMP_AFFINITY_CAPABLE()) { 2457 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2458 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2459 if (uniform) { 2460 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2461 } else { 2462 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2463 } 2464 } 2465 else { 2466 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2467 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2468 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2469 if (__kmp_affinity_respect_mask) { 2470 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2471 } else { 2472 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2473 } 2474 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2475 if (uniform) { 2476 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2477 } else { 2478 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2479 } 2480 } 2481 kmp_str_buf_t buf; 2482 __kmp_str_buf_init(&buf); 2483 2484 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2485 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2486 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2487 } 2488 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2489 maxCt[threadIdIndex], __kmp_ncores); 2490 2491 __kmp_str_buf_free(&buf); 2492 } 2493 2494 # if KMP_MIC && REDUCE_TEAM_SIZE 2495 // 2496 // Set the default team size. 
2497 // 2498 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2499 __kmp_dflt_team_nth = teamSize; 2500 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2501 __kmp_dflt_team_nth)); 2502 } 2503 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2504 2505 if (__kmp_affinity_type == affinity_none) { 2506 __kmp_free(lastId); 2507 __kmp_free(totals); 2508 __kmp_free(maxCt); 2509 __kmp_free(counts); 2510 CLEANUP_THREAD_INFO; 2511 return 0; 2512 } 2513 2514 // 2515 // Count the number of levels which have more nodes at that level than 2516 // at the parent's level (with there being an implicit root node of 2517 // the top level). This is equivalent to saying that there is at least 2518 // one node at this level which has a sibling. These levels are in the 2519 // map, and the package level is always in the map. 2520 // 2521 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2522 int level = 0; 2523 for (index = threadIdIndex; index < maxIndex; index++) { 2524 KMP_ASSERT(totals[index] >= totals[index + 1]); 2525 inMap[index] = (totals[index] > totals[index + 1]); 2526 } 2527 inMap[maxIndex] = (totals[maxIndex] > 1); 2528 inMap[pkgIdIndex] = true; 2529 2530 int depth = 0; 2531 for (index = threadIdIndex; index <= maxIndex; index++) { 2532 if (inMap[index]) { 2533 depth++; 2534 } 2535 } 2536 KMP_ASSERT(depth > 0); 2537 2538 // 2539 // Construct the data structure that is to be returned. 2540 // 2541 *address2os = (AddrUnsPair*) 2542 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2543 int pkgLevel = -1; 2544 int coreLevel = -1; 2545 int threadLevel = -1; 2546 2547 for (i = 0; i < num_avail; ++i) { 2548 Address addr(depth); 2549 unsigned os = threadInfo[i][osIdIndex]; 2550 int src_index; 2551 int dst_index = 0; 2552 2553 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2554 if (! 
inMap[src_index]) { 2555 continue; 2556 } 2557 addr.labels[dst_index] = threadInfo[i][src_index]; 2558 if (src_index == pkgIdIndex) { 2559 pkgLevel = dst_index; 2560 } 2561 else if (src_index == coreIdIndex) { 2562 coreLevel = dst_index; 2563 } 2564 else if (src_index == threadIdIndex) { 2565 threadLevel = dst_index; 2566 } 2567 dst_index++; 2568 } 2569 (*address2os)[i] = AddrUnsPair(addr, os); 2570 } 2571 2572 if (__kmp_affinity_gran_levels < 0) { 2573 // 2574 // Set the granularity level based on what levels are modeled 2575 // in the machine topology map. 2576 // 2577 unsigned src_index; 2578 __kmp_affinity_gran_levels = 0; 2579 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2580 if (! inMap[src_index]) { 2581 continue; 2582 } 2583 switch (src_index) { 2584 case threadIdIndex: 2585 if (__kmp_affinity_gran > affinity_gran_thread) { 2586 __kmp_affinity_gran_levels++; 2587 } 2588 2589 break; 2590 case coreIdIndex: 2591 if (__kmp_affinity_gran > affinity_gran_core) { 2592 __kmp_affinity_gran_levels++; 2593 } 2594 break; 2595 2596 case pkgIdIndex: 2597 if (__kmp_affinity_gran > affinity_gran_package) { 2598 __kmp_affinity_gran_levels++; 2599 } 2600 break; 2601 } 2602 } 2603 } 2604 2605 if (__kmp_affinity_verbose) { 2606 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2607 coreLevel, threadLevel); 2608 } 2609 2610 __kmp_free(inMap); 2611 __kmp_free(lastId); 2612 __kmp_free(totals); 2613 __kmp_free(maxCt); 2614 __kmp_free(counts); 2615 CLEANUP_THREAD_INFO; 2616 return depth; 2617 } 2618 2619 2620 // 2621 // Create and return a table of affinity masks, indexed by OS thread ID. 2622 // This routine handles OR'ing together all the affinity masks of threads 2623 // that are sufficiently close, if granularity > fine. 
//
static kmp_affin_mask_t *
__kmp_create_masks(unsigned *maxIndex, unsigned *numUnique,
  AddrUnsPair *address2os, unsigned numAddrs)
{
    //
    // First form a table of affinity masks in order of OS thread id.
    //
    unsigned depth;
    unsigned maxOsId;
    unsigned i;

    KMP_ASSERT(numAddrs > 0);
    depth = address2os[0].first.depth;

    //
    // Find the largest OS proc id referenced, which determines how many
    // entries the osId-indexed mask table needs.
    //
    maxOsId = 0;
    for (i = 0; i < numAddrs; i++) {
        unsigned osId = address2os[i].second;
        if (osId > maxOsId) {
            maxOsId = osId;
        }
    }
    kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate(
      (maxOsId + 1) * __kmp_affin_mask_size);

    //
    // Sort the address2os table according to physical order.  Doing so
    // will put all threads on the same core/package/node in consecutive
    // locations.
    //
    qsort(address2os, numAddrs, sizeof(*address2os),
      __kmp_affinity_cmp_Address_labels);

    KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
    if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
        KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
    }
    if (__kmp_affinity_gran_levels >= (int)depth) {
        //
        // Granularity is at least as coarse as the whole topology map, so
        // every mask will cover the entire machine and threads may migrate.
        //
        if (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none))) {
            KMP_WARNING(AffThreadsMayMigrate);
        }
    }

    //
    // Run through the table, forming the masks for all threads on each
    // core.  Threads on the same core will have identical "Address"
    // objects, not considering the last level, which must be the thread
    // id.  All threads on a core will appear consecutively.
    //
    // "sum" accumulates the union mask for the current group; j..i-1 is
    // the span of entries belonging to the group led by entry "leader".
    //
    unsigned unique = 0;
    unsigned j = 0;                             // index of 1st thread on core
    unsigned leader = 0;
    Address *leaderAddr = &(address2os[0].first);
    kmp_affin_mask_t *sum
      = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
    KMP_CPU_ZERO(sum);
    KMP_CPU_SET(address2os[0].second, sum);
    for (i = 1; i < numAddrs; i++) {
        //
        // If this thread is sufficiently close to the leader (within the
        // granularity setting), then set the bit for this os thread in the
        // affinity mask for this group, and go on to the next thread.
        //
        if (leaderAddr->isClose(address2os[i].first,
          __kmp_affinity_gran_levels)) {
            KMP_CPU_SET(address2os[i].second, sum);
            continue;
        }

        //
        // For every thread in this group, copy the mask to the thread's
        // entry in the osId2Mask table.  Mark the first address as a
        // leader.
        //
        for (; j < i; j++) {
            unsigned osId = address2os[j].second;
            KMP_DEBUG_ASSERT(osId <= maxOsId);
            kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
            KMP_CPU_COPY(mask, sum);
            address2os[j].first.leader = (j == leader);
        }
        unique++;

        //
        // Start a new mask.
        //
        leader = i;
        leaderAddr = &(address2os[i].first);
        KMP_CPU_ZERO(sum);
        KMP_CPU_SET(address2os[i].second, sum);
    }

    //
    // For every thread in last group, copy the mask to the thread's
    // entry in the osId2Mask table.
    //
    for (; j < i; j++) {
        unsigned osId = address2os[j].second;
        KMP_DEBUG_ASSERT(osId <= maxOsId);
        kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
        KMP_CPU_COPY(mask, sum);
        address2os[j].first.leader = (j == leader);
    }
    unique++;

    // Outputs: largest OS proc id seen, and the number of distinct masks.
    *maxIndex = maxOsId;
    *numUnique = unique;
    return osId2Mask;
}


//
// Stuff for the affinity proclist parsers.
It's easier to declare these vars 2738 // as file-static than to try and pass them through the calling sequence of 2739 // the recursive-descent OMP_PLACES parser. 2740 // 2741 static kmp_affin_mask_t *newMasks; 2742 static int numNewMasks; 2743 static int nextNewMask; 2744 2745 #define ADD_MASK(_mask) \ 2746 { \ 2747 if (nextNewMask >= numNewMasks) { \ 2748 numNewMasks *= 2; \ 2749 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2750 numNewMasks * __kmp_affin_mask_size); \ 2751 } \ 2752 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2753 nextNewMask++; \ 2754 } 2755 2756 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2757 { \ 2758 if (((_osId) > _maxOsId) || \ 2759 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2760 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2761 && (__kmp_affinity_type != affinity_none))) { \ 2762 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2763 } \ 2764 } \ 2765 else { \ 2766 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2767 } \ 2768 } 2769 2770 2771 // 2772 // Re-parse the proclist (for the explicit affinity type), and form the list 2773 // of affinity newMasks indexed by gtid. 2774 // 2775 static void 2776 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2777 unsigned int *out_numMasks, const char *proclist, 2778 kmp_affin_mask_t *osId2Mask, int maxOsId) 2779 { 2780 const char *scan = proclist; 2781 const char *next = proclist; 2782 2783 // 2784 // We use malloc() for the temporary mask vector, 2785 // so that we can use realloc() to extend it. 
2786 // 2787 numNewMasks = 2; 2788 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2789 * __kmp_affin_mask_size); 2790 nextNewMask = 0; 2791 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2792 __kmp_affin_mask_size); 2793 int setSize = 0; 2794 2795 for (;;) { 2796 int start, end, stride; 2797 2798 SKIP_WS(scan); 2799 next = scan; 2800 if (*next == '\0') { 2801 break; 2802 } 2803 2804 if (*next == '{') { 2805 int num; 2806 setSize = 0; 2807 next++; // skip '{' 2808 SKIP_WS(next); 2809 scan = next; 2810 2811 // 2812 // Read the first integer in the set. 2813 // 2814 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2815 "bad proclist"); 2816 SKIP_DIGITS(next); 2817 num = __kmp_str_to_int(scan, *next); 2818 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2819 2820 // 2821 // Copy the mask for that osId to the sum (union) mask. 2822 // 2823 if ((num > maxOsId) || 2824 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2825 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2826 && (__kmp_affinity_type != affinity_none))) { 2827 KMP_WARNING(AffIgnoreInvalidProcID, num); 2828 } 2829 KMP_CPU_ZERO(sumMask); 2830 } 2831 else { 2832 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2833 setSize = 1; 2834 } 2835 2836 for (;;) { 2837 // 2838 // Check for end of set. 2839 // 2840 SKIP_WS(next); 2841 if (*next == '}') { 2842 next++; // skip '}' 2843 break; 2844 } 2845 2846 // 2847 // Skip optional comma. 2848 // 2849 if (*next == ',') { 2850 next++; 2851 } 2852 SKIP_WS(next); 2853 2854 // 2855 // Read the next integer in the set. 2856 // 2857 scan = next; 2858 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2859 "bad explicit proc list"); 2860 2861 SKIP_DIGITS(next); 2862 num = __kmp_str_to_int(scan, *next); 2863 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2864 2865 // 2866 // Add the mask for that osId to the sum mask. 2867 // 2868 if ((num > maxOsId) || 2869 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2870 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2871 && (__kmp_affinity_type != affinity_none))) { 2872 KMP_WARNING(AffIgnoreInvalidProcID, num); 2873 } 2874 } 2875 else { 2876 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2877 setSize++; 2878 } 2879 } 2880 if (setSize > 0) { 2881 ADD_MASK(sumMask); 2882 } 2883 2884 SKIP_WS(next); 2885 if (*next == ',') { 2886 next++; 2887 } 2888 scan = next; 2889 continue; 2890 } 2891 2892 // 2893 // Read the first integer. 2894 // 2895 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2896 SKIP_DIGITS(next); 2897 start = __kmp_str_to_int(scan, *next); 2898 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2899 SKIP_WS(next); 2900 2901 // 2902 // If this isn't a range, then add a mask to the list and go on. 2903 // 2904 if (*next != '-') { 2905 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2906 2907 // 2908 // Skip optional comma. 2909 // 2910 if (*next == ',') { 2911 next++; 2912 } 2913 scan = next; 2914 continue; 2915 } 2916 2917 // 2918 // This is a range. Skip over the '-' and read in the 2nd int. 2919 // 2920 next++; // skip '-' 2921 SKIP_WS(next); 2922 scan = next; 2923 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2924 SKIP_DIGITS(next); 2925 end = __kmp_str_to_int(scan, *next); 2926 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2927 2928 // 2929 // Check for a stride parameter 2930 // 2931 stride = 1; 2932 SKIP_WS(next); 2933 if (*next == ':') { 2934 // 2935 // A stride is specified. Skip over the ':" and read the 3rd int. 
2936 // 2937 int sign = +1; 2938 next++; // skip ':' 2939 SKIP_WS(next); 2940 scan = next; 2941 if (*next == '-') { 2942 sign = -1; 2943 next++; 2944 SKIP_WS(next); 2945 scan = next; 2946 } 2947 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2948 "bad explicit proc list"); 2949 SKIP_DIGITS(next); 2950 stride = __kmp_str_to_int(scan, *next); 2951 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2952 stride *= sign; 2953 } 2954 2955 // 2956 // Do some range checks. 2957 // 2958 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2959 if (stride > 0) { 2960 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2961 } 2962 else { 2963 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2964 } 2965 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2966 2967 // 2968 // Add the mask for each OS proc # to the list. 2969 // 2970 if (stride > 0) { 2971 do { 2972 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2973 start += stride; 2974 } while (start <= end); 2975 } 2976 else { 2977 do { 2978 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2979 start += stride; 2980 } while (start >= end); 2981 } 2982 2983 // 2984 // Skip optional comma. 2985 // 2986 SKIP_WS(next); 2987 if (*next == ',') { 2988 next++; 2989 } 2990 scan = next; 2991 } 2992 2993 *out_numMasks = nextNewMask; 2994 if (nextNewMask == 0) { 2995 *out_masks = NULL; 2996 KMP_INTERNAL_FREE(newMasks); 2997 return; 2998 } 2999 *out_masks 3000 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3001 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3002 __kmp_free(sumMask); 3003 KMP_INTERNAL_FREE(newMasks); 3004 } 3005 3006 3007 # if OMP_40_ENABLED 3008 3009 /*----------------------------------------------------------------------------- 3010 3011 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3012 places. 
Again, Here is the grammar: 3013 3014 place_list := place 3015 place_list := place , place_list 3016 place := num 3017 place := place : num 3018 place := place : num : signed 3019 place := { subplacelist } 3020 place := ! place // (lowest priority) 3021 subplace_list := subplace 3022 subplace_list := subplace , subplace_list 3023 subplace := num 3024 subplace := num : num 3025 subplace := num : num : signed 3026 signed := num 3027 signed := + signed 3028 signed := - signed 3029 3030 -----------------------------------------------------------------------------*/ 3031 3032 static void 3033 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 3034 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3035 { 3036 const char *next; 3037 3038 for (;;) { 3039 int start, count, stride, i; 3040 3041 // 3042 // Read in the starting proc id 3043 // 3044 SKIP_WS(*scan); 3045 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3046 "bad explicit places list"); 3047 next = *scan; 3048 SKIP_DIGITS(next); 3049 start = __kmp_str_to_int(*scan, *next); 3050 KMP_ASSERT(start >= 0); 3051 *scan = next; 3052 3053 // 3054 // valid follow sets are ',' ':' and '}' 3055 // 3056 SKIP_WS(*scan); 3057 if (**scan == '}' || **scan == ',') { 3058 if ((start > maxOsId) || 3059 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3060 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3061 && (__kmp_affinity_type != affinity_none))) { 3062 KMP_WARNING(AffIgnoreInvalidProcID, start); 3063 } 3064 } 3065 else { 3066 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3067 (*setSize)++; 3068 } 3069 if (**scan == '}') { 3070 break; 3071 } 3072 (*scan)++; // skip ',' 3073 continue; 3074 } 3075 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3076 (*scan)++; // skip ':' 3077 3078 // 3079 // Read count parameter 3080 // 3081 SKIP_WS(*scan); 3082 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3083 "bad explicit places list"); 3084 next = *scan; 3085 SKIP_DIGITS(next); 3086 count = __kmp_str_to_int(*scan, *next); 3087 KMP_ASSERT(count >= 0); 3088 *scan = next; 3089 3090 // 3091 // valid follow sets are ',' ':' and '}' 3092 // 3093 SKIP_WS(*scan); 3094 if (**scan == '}' || **scan == ',') { 3095 for (i = 0; i < count; i++) { 3096 if ((start > maxOsId) || 3097 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3098 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3099 && (__kmp_affinity_type != affinity_none))) { 3100 KMP_WARNING(AffIgnoreInvalidProcID, start); 3101 } 3102 break; // don't proliferate warnings for large count 3103 } 3104 else { 3105 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3106 start++; 3107 (*setSize)++; 3108 } 3109 } 3110 if (**scan == '}') { 3111 break; 3112 } 3113 (*scan)++; // skip ',' 3114 continue; 3115 } 3116 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3117 (*scan)++; // skip ':' 3118 3119 // 3120 // Read stride parameter 3121 // 3122 int sign = +1; 3123 for (;;) { 3124 SKIP_WS(*scan); 3125 if (**scan == '+') { 3126 (*scan)++; // skip '+' 3127 continue; 3128 } 3129 if (**scan == '-') { 3130 sign *= -1; 3131 (*scan)++; // skip '-' 3132 continue; 3133 } 3134 break; 3135 } 3136 SKIP_WS(*scan); 3137 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3138 "bad explicit places list"); 3139 next = *scan; 3140 SKIP_DIGITS(next); 3141 stride = __kmp_str_to_int(*scan, *next); 3142 KMP_ASSERT(stride >= 0); 3143 *scan = next; 3144 stride *= sign; 3145 3146 // 3147 // valid follow sets are ',' and '}' 3148 // 3149 SKIP_WS(*scan); 3150 if (**scan == '}' || **scan == ',') { 3151 for (i = 0; i < count; i++) { 3152 if ((start > maxOsId) || 3153 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3154 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3155 && (__kmp_affinity_type != affinity_none))) { 3156 KMP_WARNING(AffIgnoreInvalidProcID, start); 3157 } 3158 break; // don't proliferate warnings for large count 3159 } 3160 else { 3161 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3162 start += stride; 3163 (*setSize)++; 3164 } 3165 } 3166 if (**scan == '}') { 3167 break; 3168 } 3169 (*scan)++; // skip ',' 3170 continue; 3171 } 3172 3173 KMP_ASSERT2(0, "bad explicit places list"); 3174 } 3175 } 3176 3177 3178 static void 3179 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3180 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3181 { 3182 const char *next; 3183 3184 // 3185 // valid follow sets are '{' '!' and num 3186 // 3187 SKIP_WS(*scan); 3188 if (**scan == '{') { 3189 (*scan)++; // skip '{' 3190 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3191 setSize); 3192 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3193 (*scan)++; // skip '}' 3194 } 3195 else if (**scan == '!') { 3196 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3197 KMP_CPU_COMPLEMENT(tempMask); 3198 (*scan)++; // skip '!' 3199 } 3200 else if ((**scan >= '0') && (**scan <= '9')) { 3201 next = *scan; 3202 SKIP_DIGITS(next); 3203 int num = __kmp_str_to_int(*scan, *next); 3204 KMP_ASSERT(num >= 0); 3205 if ((num > maxOsId) || 3206 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffIgnoreInvalidProcID, num);
            }
        }
        else {
            KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
            (*setSize)++;
        }
        *scan = next;  // skip num
    }
    else {
        KMP_ASSERT2(0, "bad explicit places list");
    }
}


//
// Parse an explicit place list string ("placelist") into an array of
// affinity masks.  On return, *out_masks points to a freshly allocated
// array of *out_numMasks masks (or NULL if no valid place was found).
// osId2Mask maps OS proc ids (up to maxOsId) to their affinity masks.
// Grammar per place-list entry: <place>[:<count>[:<stride>]], entries
// separated by ','.  Invalid proc ids produce warnings and are skipped;
// malformed syntax is a hard assert.
//
// NOTE(review): numNewMasks, newMasks and nextNewMask are not declared
// here - presumably file-scope scratch state shared with the ADD_MASK
// macro defined earlier in this file; confirm against the full source.
//
//static void
void
__kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
  unsigned int *out_numMasks, const char *placelist,
  kmp_affin_mask_t *osId2Mask, int maxOsId)
{
    const char *scan = placelist;
    const char *next = placelist;

    numNewMasks = 2;
    newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks
      * __kmp_affin_mask_size);
    nextNewMask = 0;

    // tempMask accumulates the procs of the place currently being parsed;
    // setSize counts the valid procs added to it.
    kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate(
      __kmp_affin_mask_size);
    KMP_CPU_ZERO(tempMask);
    int setSize = 0;

    for (;;) {
        __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0' || *scan == ',') {
            // Place complete (no count/stride): commit it and start fresh.
            if (setSize > 0) {
                ADD_MASK(tempMask);
            }
            KMP_CPU_ZERO(tempMask);
            setSize = 0;
            if (*scan == '\0') {
                break;
            }
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(*scan == ':', "bad explicit places list");
        scan++;         // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(scan);
        KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
          "bad explicit places list");
        next = scan;
        SKIP_DIGITS(next);
        int count = __kmp_str_to_int(scan, *next);
        KMP_ASSERT(count >= 0);
        scan = next;

        //
        // valid follow sets are ',' ':' and EOL
        //
        SKIP_WS(scan);
        int stride;
        if (*scan == '\0' || *scan == ',') {
            stride = +1;    // stride omitted: default to +1
        }
else {
            KMP_ASSERT2(*scan == ':', "bad explicit places list");
            scan++;         // skip ':'

            //
            // Read stride parameter (optional sign(s) followed by digits).
            //
            int sign = +1;
            for (;;) {
                SKIP_WS(scan);
                if (*scan == '+') {
                    scan++; // skip '+'
                    continue;
                }
                if (*scan == '-') {
                    sign *= -1;
                    scan++; // skip '-'
                    continue;
                }
                break;
            }
            SKIP_WS(scan);
            KMP_ASSERT2((*scan >= '0') && (*scan <= '9'),
              "bad explicit places list");
            next = scan;
            SKIP_DIGITS(next);
            stride = __kmp_str_to_int(scan, *next);
            KMP_DEBUG_ASSERT(stride >= 0);
            scan = next;
            stride *= sign;
        }

        //
        // Emit "count" copies of the current place, shifting the whole
        // mask by "stride" OS proc ids between copies.  The shift is done
        // in place, bit by bit; bits that shift onto invalid or
        // out-of-range proc ids are dropped (with a warning unless this
        // is the last iteration).
        //
        if (stride > 0) {
            // Positive stride: walk the mask top-down so each bit is read
            // before it is overwritten by the shift.
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;  // everything shifted off the end - stop early
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                // Clear the bits below the shift distance (vacated slots).
                for (; j >= 0; j--) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        else {
            // Negative (or zero) stride: walk bottom-up, reading bit
            // j - stride (i.e. a higher index) before it is overwritten.
            int i;
            for (i = 0; i < count; i++) {
                int j;
                if (setSize == 0) {
                    break;
                }
                ADD_MASK(tempMask);
                setSize = 0;
                for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride;
                  j++) {
                    if (! KMP_CPU_ISSET(j - stride, tempMask)) {
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else if ((j > maxOsId) ||
                      (!
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) {
                        if ((__kmp_affinity_verbose || (__kmp_affinity_warnings
                          && (__kmp_affinity_type != affinity_none))) && i < count - 1) {
                            KMP_WARNING(AffIgnoreInvalidProcID, j);
                        }
                        KMP_CPU_CLR(j, tempMask);
                    }
                    else {
                        KMP_CPU_SET(j, tempMask);
                        setSize++;
                    }
                }
                // Clear the vacated high-end bits.
                for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) {
                    KMP_CPU_CLR(j, tempMask);
                }
            }
        }
        KMP_CPU_ZERO(tempMask);
        setSize = 0;

        //
        // valid follow sets are ',' and EOL
        //
        SKIP_WS(scan);
        if (*scan == '\0') {
            break;
        }
        if (*scan == ',') {
            scan++;     // skip ','
            continue;
        }

        KMP_ASSERT2(0, "bad explicit places list");
    }

    // Hand the accumulated masks back to the caller in a right-sized
    // allocation; the growable scratch array is always freed here.
    *out_numMasks = nextNewMask;
    if (nextNewMask == 0) {
        *out_masks = NULL;
        KMP_INTERNAL_FREE(newMasks);
        return;
    }
    *out_masks
      = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size);
    KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size);
    __kmp_free(tempMask);
    KMP_INTERNAL_FREE(newMasks);
}

# endif /* OMP_40_ENABLED */

#undef ADD_MASK
#undef ADD_MASK_OSID

//
// Restrict the machine topology in *pAddr to the first
// __kmp_place_num_cores cores per package (starting at
// __kmp_place_core_offset) and the first __kmp_place_num_threads_per_core
// HW threads per core, rewriting the global topology counters to match.
// No-op unless at least one of the limiting globals is set.  Requires a
// uniform, 3-level (package/core/thread) topology.
// NOTE(review): presumably driven by a KMP_PLACE_THREADS-style setting
// parsed elsewhere - confirm against the full source.
//
static void
__kmp_apply_thread_places(AddrUnsPair **pAddr, int depth)
{
    if ( __kmp_place_num_cores == 0 ) {
        if ( __kmp_place_num_threads_per_core == 0 ) {
            return;   // no cores limiting actions requested, exit
        }
        __kmp_place_num_cores = nCoresPerPkg;   // use all available cores
    }
    if ( !__kmp_affinity_uniform_topology() ) {
        KMP_WARNING( AffThrPlaceNonUniform );
        return; // don't support non-uniform topology
    }
    if ( depth != 3 ) {
        KMP_WARNING( AffThrPlaceNonThreeLevel );
        return; // don't support not-3-level topology
    }
    if ( __kmp_place_num_threads_per_core == 0 ) {
        __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore;   // use all HW contexts
    }
if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) {
        KMP_WARNING( AffThrPlaceManyCores );
        return;
    }

    // New, smaller topology array: uniform topology (checked above) means
    // the element count is exactly packages * cores * threads-per-core.
    AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) *
        nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core);
    int i, j, k, n_old = 0, n_new = 0;
    // Walk the old array in (package, core, thread) order, copying only
    // the requested cores/threads; n_old tracks the old index, n_new the new.
    for ( i = 0; i < nPackages; ++i ) {
        for ( j = 0; j < nCoresPerPkg; ++j ) {
            if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) {
                n_old += __kmp_nThreadsPerCore;   // skip not-requested core
            } else {
                for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) {
                    if ( k < __kmp_place_num_threads_per_core ) {
                        newAddr[n_new] = (*pAddr)[n_old];   // copy requested core' data to new location
                        n_new++;
                    }
                    n_old++;
                }
            }
        }
    }
    nCoresPerPkg = __kmp_place_num_cores;                     // correct nCoresPerPkg
    __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore
    __kmp_avail_proc = n_new;                                 // correct avail_proc
    __kmp_ncores = nPackages * __kmp_place_num_cores;         // correct ncores

    __kmp_free( *pAddr );
    *pAddr = newAddr;   // replace old topology with new one
}


// Machine topology table (one entry per available OS proc) and the
// balanced-affinity core/thread -> OS proc map, filled in below.
static AddrUnsPair *address2os = NULL;
static int * procarr = NULL;
static int __kmp_aff_depth = 0;

//
// One-time affinity setup: build the full mask, discover the machine
// topology (via the method selected by __kmp_affinity_top_method), and
// construct the gtid -> affinity mask table.  Idempotent: returns
// immediately if the masks were already created.
//
static void
__kmp_aux_affinity_initialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        KMP_ASSERT(fullMask != NULL);
        return;
    }

    //
    // Create the "full" mask - this defines all of the processors that we
    // consider to be in the machine model.  If respect is set, then it is
    // the initialization thread's affinity mask.  Otherwise, it is all
    // processors that we know about on the machine.
//
    if (fullMask == NULL) {
        fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size);
    }
    if (KMP_AFFINITY_CAPABLE()) {
        if (__kmp_affinity_respect_mask) {
            __kmp_get_system_affinity(fullMask, TRUE);

            //
            // Count the number of available processors.
            //
            unsigned i;
            __kmp_avail_proc = 0;
            for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
                if (! KMP_CPU_ISSET(i, fullMask)) {
                    continue;
                }
                __kmp_avail_proc++;
            }
            // Sanity check: the inherited mask cannot contain more procs
            // than the machine has; if it does, give up on affinity.
            if (__kmp_avail_proc > __kmp_xproc) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(ErrorInitializeAffinity);
                }
                __kmp_affinity_type = affinity_none;
                KMP_AFFINITY_DISABLE();
                return;
            }
        }
        else {
            __kmp_affinity_entire_machine_mask(fullMask);
            __kmp_avail_proc = __kmp_xproc;
        }
    }

    // depth < 0 means "no topology discovered yet / last method failed";
    // depth == 0 means affinity was turned off by the discovery routine.
    int depth = -1;
    kmp_i18n_id_t msg_id = kmp_i18n_null;

    //
    // For backward compatibility, setting KMP_CPUINFO_FILE =>
    // KMP_TOPOLOGY_METHOD=cpuinfo
    //
    if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
        __kmp_affinity_top_method = affinity_top_method_cpuinfo;
    }

    if (__kmp_affinity_top_method == affinity_top_method_all) {
        //
        // In the default code path, errors are not fatal - we just try using
        // another method.  We only emit a warning message if affinity is on,
        // or the verbose flag is set, and the nowarnings flag was not set.
        //
        const char *file_name = NULL;
        int line = 0;

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

        // Preferred method on x86: x2APIC ids from CPUID.
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
        }

        file_name = NULL;
        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }

        if (depth < 0) {
            // x2APIC failed - fall back to the legacy APIC id method.
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id),
                      KMP_I18N_STR(DecodingLegacyAPIC));
                }
                else {
                    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
                }
            }

            file_name = NULL;
            depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
        }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

# if KMP_OS_LINUX

        // Next fallback on Linux: parse /proc/cpuinfo.
        if (depth < 0) {
            if (__kmp_affinity_verbose) {
                if (msg_id != kmp_i18n_null) {
                    KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
                }
                else {
                    KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
                }
            }

            FILE *f = fopen("/proc/cpuinfo", "r");
            if (f == NULL) {
                msg_id = kmp_i18n_str_CantOpenCpuinfo;
            }
            else {
                file_name = "/proc/cpuinfo";
                depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
                fclose(f);
                if (depth == 0) {
                    KMP_ASSERT(__kmp_affinity_type == affinity_none);
                    KMP_ASSERT(address2os == NULL);
                    return;
                }
            }
        }

# endif /* KMP_OS_LINUX */

# if KMP_GROUP_AFFINITY

        // Windows processor groups, when more than one group exists.
        if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
            if (__kmp_affinity_verbose) {
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
            }

            depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        // Last resort: flat OS-proc-id map.
        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_ASSERT(__kmp_affinity_type == affinity_none);
                KMP_ASSERT(address2os == NULL);
                return;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails.  The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            // Explicitly requested method failed => fatal.
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            int code = errno;
            // NOTE(review): the code below falls through to use f after
            // this branch - it relies on __kmp_msg(kmp_ms_fatal, ...)
            // not returning; confirm that fatal messages terminate.
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_ASSERT(address2os == NULL);
            return;
        }
    }

# if KMP_GROUP_AFFINITY

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
        KMP_ASSERT(depth != 0);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_GROUP_AFFINITY */

    else if (__kmp_affinity_top_method == affinity_top_method_flat) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
        }

        depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_ASSERT(__kmp_affinity_type == affinity_none);
            KMP_ASSERT(address2os == NULL);
            return;
        }
        // should not fail
        KMP_ASSERT(depth > 0);
        KMP_ASSERT(address2os != NULL);
    }

    // No method produced a topology: disable affinity entirely.
    if (address2os == NULL) {
        if (KMP_AFFINITY_CAPABLE()
          && (__kmp_affinity_verbose || (__kmp_affinity_warnings
          && (__kmp_affinity_type != affinity_none)))) {
            KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
    }

    __kmp_apply_thread_places(&address2os, depth);

    //
    // Create the table of masks, indexed by thread Id.
//
    unsigned maxIndex;
    unsigned numUnique;
    kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique,
      address2os, __kmp_avail_proc);
    if (__kmp_affinity_gran_levels == 0) {
        KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
    }

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

        case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            // Intel-style proclist syntax.
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            // OMP 4.0 place-list syntax.
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

        //
        // The other affinity types rely on sorting the Addresses according
        // to some permutation of the machine topology tree.  Set
        // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
        // then jump to a common code fragment to do the sort and create
        // the array of affinity masks.
        //

        case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

        case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            // scatter is the mirror image of compact.
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

        case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

        case affinity_balanced:
        // Balanced works only for the case of a single package
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            // Build procarr: (core, thread) -> OS proc id, -1 = no proc.
            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }

        sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        // OMP_PLACES-style binding may further limit the number of places.
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate(
          __kmp_affinity_num_masks * __kmp_affin_mask_size);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                // Without dups, only the leader of each unique mask is kept.
                if ((! __kmp_affinity_dups) && (!
address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

        default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}


//
// Public entry point for affinity initialization.  Translates the
// affinity_disabled representation back and forth around the real
// initialization routine (see comment below).
//
void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code.  Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (!
KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


//
// Release all affinity state allocated by initialization: the mask table,
// the full mask, the proclist string, the topology table, and the
// balanced-affinity proc array.  Safe to call when nothing was allocated.
//
void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        __kmp_free(__kmp_affinity_masks);
        __kmp_affinity_masks = NULL;
    }
    if (fullMask != NULL) {
        KMP_CPU_FREE(fullMask);
        fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if( address2os != NULL ) {
        __kmp_free( address2os );
        address2os = NULL;
    }
    if( procarr != NULL ) {
        __kmp_free( procarr );
        procarr = NULL;
    }
}


//
// Compute and apply the initial affinity mask for thread gtid, storing a
// copy in the thread's kmp_info_t.  isa_root selects root-thread handling
// of the OMP 4.0 place partition fields.
//
void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
//
    // "mask" is what gets applied; "i" is the place index it came from,
    // or KMP_PLACE_ALL when the full mask is used.
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)
          ) {
# if KMP_GROUP_AFFINITY
            // With multiple processor groups there is no single full mask
            // to apply - leave the thread unbound.
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            // Round-robin over the mask table, offset by the user setting.
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        // Root threads get the full place partition.
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

//
// Move thread gtid to its pending place (th_new_place): copy that place's
// mask into the thread and apply it to the OS.
//
void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (!
KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    // NOTE(review): "<=" allows th_new_place == __kmp_affinity_num_masks,
    // which would index one past the mask table below - looks like this
    // should be "<"; confirm against upstream before changing.
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    // The place partition may wrap around the end of the place list, in
    // which case first_place > last_place and the bounds test inverts.
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


//
// kmp_set_affinity() implementation: validate the user-supplied mask
// (under KMP_CONSISTENCY_CHECK), apply it to the calling thread, and
// remember it.  Returns the OS result, or -1 if affinity is not capable.
//
int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (!
KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            // Reject masks that are empty or contain procs outside the
            // full (machine-model) mask.
            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            // A mask spanning multiple Windows processor groups is invalid.
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        // Only remember the mask if the OS accepted it.
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


//
// kmp_get_affinity() implementation.  On non-Windows, query the OS for
// the calling thread's current mask; on Windows, return the stored copy.
// Returns the OS result (0 on Windows path), or -1 if not capable.
//
int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

//
// kmp_set_affinity_mask_proc() implementation: set bit "proc" in the
// user's mask.  Returns 0 on success, -1 for an out-of-range proc,
// -2 for a proc not in the full mask.
//
int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (!
KMP_AFFINITY_CAPABLE()) { 4358 return -1; 4359 } 4360 4361 KA_TRACE(1000, ;{ 4362 int gtid = __kmp_entry_gtid(); 4363 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4364 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4365 (kmp_affin_mask_t *)(*mask)); 4366 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4367 proc, gtid, buf); 4368 }); 4369 4370 if (__kmp_env_consistency_check) { 4371 if ((mask == NULL) || (*mask == NULL)) { 4372 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4373 } 4374 } 4375 4376 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4377 return -1; 4378 } 4379 if (! KMP_CPU_ISSET(proc, fullMask)) { 4380 return -2; 4381 } 4382 4383 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4384 return 0; 4385 } 4386 4387 4388 int 4389 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4390 { 4391 int retval; 4392 4393 if (! KMP_AFFINITY_CAPABLE()) { 4394 return -1; 4395 } 4396 4397 KA_TRACE(1000, ;{ 4398 int gtid = __kmp_entry_gtid(); 4399 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4400 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4401 (kmp_affin_mask_t *)(*mask)); 4402 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4403 proc, gtid, buf); 4404 }); 4405 4406 if (__kmp_env_consistency_check) { 4407 if ((mask == NULL) || (*mask == NULL)) { 4408 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4409 } 4410 } 4411 4412 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4413 return -1; 4414 } 4415 if (! KMP_CPU_ISSET(proc, fullMask)) { 4416 return -2; 4417 } 4418 4419 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4420 return 0; 4421 } 4422 4423 4424 int 4425 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4426 { 4427 int retval; 4428 4429 if (! 
KMP_AFFINITY_CAPABLE()) { 4430 return -1; 4431 } 4432 4433 KA_TRACE(1000, ;{ 4434 int gtid = __kmp_entry_gtid(); 4435 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4436 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4437 (kmp_affin_mask_t *)(*mask)); 4438 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n", 4439 proc, gtid, buf); 4440 }); 4441 4442 if (__kmp_env_consistency_check) { 4443 if ((mask == NULL) || (*mask == NULL)) { 4444 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4445 } 4446 } 4447 4448 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4449 return 0; 4450 } 4451 if (! KMP_CPU_ISSET(proc, fullMask)) { 4452 return 0; 4453 } 4454 4455 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4456 } 4457 4458 4459 // Dynamic affinity settings - Affinity balanced 4460 void __kmp_balanced_affinity( int tid, int nthreads ) 4461 { 4462 if( __kmp_affinity_uniform_topology() ) { 4463 int coreID; 4464 int threadID; 4465 // Number of hyper threads per core in HT machine 4466 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4467 // Number of cores 4468 int ncores = __kmp_ncores; 4469 // How many threads will be bound to each core 4470 int chunk = nthreads / ncores; 4471 // How many cores will have an additional thread bound to it - "big cores" 4472 int big_cores = nthreads % ncores; 4473 // Number of threads on the big cores 4474 int big_nth = ( chunk + 1 ) * big_cores; 4475 if( tid < big_nth ) { 4476 coreID = tid / (chunk + 1 ); 4477 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ; 4478 } else { //tid >= big_nth 4479 coreID = ( tid - big_cores ) / chunk; 4480 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ; 4481 } 4482 4483 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4484 "Illegal set affinity operation when not capable"); 4485 4486 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 4487 KMP_CPU_ZERO(mask); 4488 4489 // 
Granularity == thread 4490 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4491 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; 4492 KMP_CPU_SET( osID, mask); 4493 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4494 for( int i = 0; i < __kmp_nth_per_core; i++ ) { 4495 int osID; 4496 osID = address2os[ coreID * __kmp_nth_per_core + i ].second; 4497 KMP_CPU_SET( osID, mask); 4498 } 4499 } 4500 if (__kmp_affinity_verbose) { 4501 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4502 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4503 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4504 tid, buf); 4505 } 4506 __kmp_set_system_affinity( mask, TRUE ); 4507 } else { // Non-uniform topology 4508 4509 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 4510 KMP_CPU_ZERO(mask); 4511 4512 // Number of hyper threads per core in HT machine 4513 int nth_per_core = __kmp_nThreadsPerCore; 4514 int core_level; 4515 if( nth_per_core > 1 ) { 4516 core_level = __kmp_aff_depth - 2; 4517 } else { 4518 core_level = __kmp_aff_depth - 1; 4519 } 4520 4521 // Number of cores - maximum value; it does not count trail cores with 0 processors 4522 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 4523 4524 // For performance gain consider the special case nthreads == __kmp_avail_proc 4525 if( nthreads == __kmp_avail_proc ) { 4526 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4527 int osID = address2os[ tid ].second; 4528 KMP_CPU_SET( osID, mask); 4529 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4530 int coreID = address2os[ tid ].first.labels[ core_level ]; 4531 // We'll count found osIDs for the current core; they can be not more than nth_per_core; 4532 // since the address2os is sortied we can break when cnt==nth_per_core 4533 
int cnt = 0; 4534 for( int i = 0; i < __kmp_avail_proc; i++ ) { 4535 int osID = address2os[ i ].second; 4536 int core = address2os[ i ].first.labels[ core_level ]; 4537 if( core == coreID ) { 4538 KMP_CPU_SET( osID, mask); 4539 cnt++; 4540 if( cnt == nth_per_core ) { 4541 break; 4542 } 4543 } 4544 } 4545 } 4546 } else if( nthreads <= __kmp_ncores ) { 4547 4548 int core = 0; 4549 for( int i = 0; i < ncores; i++ ) { 4550 // Check if this core from procarr[] is in the mask 4551 int in_mask = 0; 4552 for( int j = 0; j < nth_per_core; j++ ) { 4553 if( procarr[ i * nth_per_core + j ] != - 1 ) { 4554 in_mask = 1; 4555 break; 4556 } 4557 } 4558 if( in_mask ) { 4559 if( tid == core ) { 4560 for( int j = 0; j < nth_per_core; j++ ) { 4561 int osID = procarr[ i * nth_per_core + j ]; 4562 if( osID != -1 ) { 4563 KMP_CPU_SET( osID, mask ); 4564 // For granularity=thread it is enough to set the first available osID for this core 4565 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4566 break; 4567 } 4568 } 4569 } 4570 break; 4571 } else { 4572 core++; 4573 } 4574 } 4575 } 4576 4577 } else { // nthreads > __kmp_ncores 4578 4579 // Array to save the number of processors at each core 4580 int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores); 4581 // Array to save the number of cores with "x" available processors; 4582 int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1)); 4583 // Array to save the number of cores with # procs from x to nth_per_core 4584 int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1)); 4585 4586 for( int i = 0; i <= nth_per_core; i++ ) { 4587 ncores_with_x_procs[ i ] = 0; 4588 ncores_with_x_to_max_procs[ i ] = 0; 4589 } 4590 4591 for( int i = 0; i < ncores; i++ ) { 4592 int cnt = 0; 4593 for( int j = 0; j < nth_per_core; j++ ) { 4594 if( procarr[ i * nth_per_core + j ] != -1 ) { 4595 cnt++; 4596 } 4597 } 4598 nproc_at_core[ i ] = cnt; 4599 ncores_with_x_procs[ 
cnt ]++; 4600 } 4601 4602 for( int i = 0; i <= nth_per_core; i++ ) { 4603 for( int j = i; j <= nth_per_core; j++ ) { 4604 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ]; 4605 } 4606 } 4607 4608 // Max number of processors 4609 int nproc = nth_per_core * ncores; 4610 // An array to keep number of threads per each context 4611 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 4612 for( int i = 0; i < nproc; i++ ) { 4613 newarr[ i ] = 0; 4614 } 4615 4616 int nth = nthreads; 4617 int flag = 0; 4618 while( nth > 0 ) { 4619 for( int j = 1; j <= nth_per_core; j++ ) { 4620 int cnt = ncores_with_x_to_max_procs[ j ]; 4621 for( int i = 0; i < ncores; i++ ) { 4622 // Skip the core with 0 processors 4623 if( nproc_at_core[ i ] == 0 ) { 4624 continue; 4625 } 4626 for( int k = 0; k < nth_per_core; k++ ) { 4627 if( procarr[ i * nth_per_core + k ] != -1 ) { 4628 if( newarr[ i * nth_per_core + k ] == 0 ) { 4629 newarr[ i * nth_per_core + k ] = 1; 4630 cnt--; 4631 nth--; 4632 break; 4633 } else { 4634 if( flag != 0 ) { 4635 newarr[ i * nth_per_core + k ] ++; 4636 cnt--; 4637 nth--; 4638 break; 4639 } 4640 } 4641 } 4642 } 4643 if( cnt == 0 || nth == 0 ) { 4644 break; 4645 } 4646 } 4647 if( nth == 0 ) { 4648 break; 4649 } 4650 } 4651 flag = 1; 4652 } 4653 int sum = 0; 4654 for( int i = 0; i < nproc; i++ ) { 4655 sum += newarr[ i ]; 4656 if( sum > tid ) { 4657 // Granularity == thread 4658 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4659 int osID = procarr[ i ]; 4660 KMP_CPU_SET( osID, mask); 4661 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4662 int coreID = i / nth_per_core; 4663 for( int ii = 0; ii < nth_per_core; ii++ ) { 4664 int osID = procarr[ coreID * nth_per_core + ii ]; 4665 if( osID != -1 ) { 4666 KMP_CPU_SET( osID, mask); 4667 } 4668 } 4669 } 4670 break; 4671 } 4672 } 4673 __kmp_free( newarr ); 4674 } 4675 4676 if (__kmp_affinity_verbose) { 4677 char 
buf[KMP_AFFIN_MASK_PRINT_LEN]; 4678 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4679 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4680 tid, buf); 4681 } 4682 __kmp_set_system_affinity( mask, TRUE ); 4683 } 4684 } 4685 4686 #else 4687 // affinity not supported 4688 4689 static const kmp_uint32 noaff_maxLevels=7; 4690 kmp_uint32 noaff_skipPerLevel[noaff_maxLevels]; 4691 kmp_uint32 noaff_depth; 4692 kmp_uint8 noaff_leaf_kids; 4693 kmp_int8 noaff_uninitialized=1; 4694 4695 void noaff_init(int nprocs) 4696 { 4697 kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2); 4698 if (result == 0) return; // Already initialized 4699 else if (result == 2) { // Someone else is initializing 4700 while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE(); 4701 return; 4702 } 4703 KMP_DEBUG_ASSERT(result==1); 4704 4705 kmp_uint32 numPerLevel[noaff_maxLevels]; 4706 noaff_depth = 1; 4707 for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level 4708 numPerLevel[i] = 1; 4709 noaff_skipPerLevel[i] = 1; 4710 } 4711 4712 numPerLevel[0] = 4; 4713 numPerLevel[1] = nprocs/4; 4714 if (nprocs%4) numPerLevel[1]++; 4715 4716 for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth 4717 if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1' 4718 noaff_depth++; 4719 4720 kmp_uint32 branch = 4; 4721 if (numPerLevel[0] == 1) branch = nprocs/4; 4722 if (branch<4) branch=4; 4723 for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width 4724 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0! 
4725 if (numPerLevel[d] & 1) numPerLevel[d]++; 4726 numPerLevel[d] = numPerLevel[d] >> 1; 4727 if (numPerLevel[d+1] == 1) noaff_depth++; 4728 numPerLevel[d+1] = numPerLevel[d+1] << 1; 4729 } 4730 if(numPerLevel[0] == 1) { 4731 branch = branch >> 1; 4732 if (branch<4) branch = 4; 4733 } 4734 } 4735 4736 for (kmp_uint32 i=1; i<noaff_depth; ++i) 4737 noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1]; 4738 // Fill in hierarchy in the case of oversubscription 4739 for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i) 4740 noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1]; 4741 noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1; 4742 noaff_uninitialized = 0; // One writer 4743 4744 } 4745 4746 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 4747 if (noaff_uninitialized) 4748 noaff_init(nproc); 4749 4750 thr_bar->depth = noaff_depth; 4751 thr_bar->base_leaf_kids = noaff_leaf_kids; 4752 thr_bar->skip_per_level = noaff_skipPerLevel; 4753 } 4754 4755 #endif // KMP_AFFINITY_SUPPORTED 4756