/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, buf_len, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, buf_len, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, buf_len, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, buf_len, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, buf_len, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}
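
// Illustrative note (not part of the original sources): for a mask with OS
// procs 0-3 and 8 set, __kmp_affinity_print_mask() produces "{0,1,2,3,8}";
// an empty mask prints as "{<empty>}", and a set too large for the buffer is
// truncated with a trailing ",...}".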

//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */

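// Illustrative example (assumed values, not from the original sources): on a
// 2-package, 2-core, 2-thread machine an Address of depth 3 for one OS proc
// might carry labels {1, 0, 1}, i.e. package 1, core 0, thread 1; the
// childNums[] array holds the same positions renumbered as ordinal child
// indices by __kmp_affinity_assign_child_nums() below.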

static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}

/** A structure for holding machine-specific hierarchy info to be computed once at init.
    This structure represents a mapping of threads to the actual machine hierarchy, or to
    our best guess at what the hierarchy might be, for the purpose of performing an
    efficient barrier.  In the worst case, when there is no machine hierarchy information,
    it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
    /** Number of levels in the hierarchy.  Typical levels are threads/core, cores/package
        or socket, packages/node, nodes/machine, etc.  We don't want to get specific with
        nomenclature.  When the machine is oversubscribed we add levels to duplicate the
        hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
    kmp_uint32 maxLevels;

    /** This is specifically the depth of the machine configuration hierarchy, in terms of the
        number of levels along the longest path from root to any leaf.  It corresponds to the
        number of entries in numPerLevel if we exclude all but one trailing 1. */
    kmp_uint32 depth;
    kmp_uint32 base_num_threads;
    volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
    volatile kmp_int8 resizing;      // 0=not resizing, 1=resizing

    /** Level 0 corresponds to leaves.  numPerLevel[i] is the number of children the parent of a
        node at level i has.  For example, if we have a machine with 4 packages, 4 cores/package
        and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}.  All empty levels are set to 1. */
    kmp_uint32 *numPerLevel;
    kmp_uint32 *skipPerLevel;

    void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
        int hier_depth = adr2os[0].first.depth;
        int level = 0;
        for (int i=hier_depth-1; i>=0; --i) {
            int max = -1;
            for (int j=0; j<num_addrs; ++j) {
                int next = adr2os[j].first.childNums[i];
                if (next > max) max = next;
            }
            numPerLevel[level] = max+1;
            ++level;
        }
    }

    hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}

    // TO FIX: This destructor causes a segfault in the library at shutdown.
    //~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }

    void init(AddrUnsPair *adr2os, int num_addrs)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
        if (bool_result == 0) { // Wait for initialization
            while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result==1);

        /* Added explicit initialization of the data fields here to prevent usage of dirty value
           observed when static library is re-initialized multiple times (e.g. when
           non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
        depth = 1;
        resizing = 0;
        maxLevels = 7;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);
        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Sort table by physical ID
        if (adr2os) {
            qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
            deriveLevels(adr2os, num_addrs);
        }
        else {
            numPerLevel[0] = 4;
            numPerLevel[1] = num_addrs/4;
            if (num_addrs%4) numPerLevel[1]++;
        }

        base_num_threads = num_addrs;
        for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
            if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
                depth++;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = num_addrs/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if (numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<depth; ++i)
            skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
        // Fill in hierarchy in the case of oversubscription
        for (kmp_uint32 i=depth; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        uninitialized = 0; // One writer
    }
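
    // Illustrative example (assumed values, not from the original sources):
    // with numPerLevel = {2, 4, 4, 1, 1, 1, 1} from the class comment above,
    // init() computes depth = 4 and skipPerLevel = {1, 2, 8, 32, 64, 128, 256};
    // each entry is the subtree size below that level, with the entries past
    // depth doubling to leave room for oversubscription.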

    void resize(kmp_uint32 nproc)
    {
        kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
        if (bool_result == 0) { // Someone else is resizing
            while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
            return;
        }
        KMP_DEBUG_ASSERT(bool_result!=0);
        KMP_DEBUG_ASSERT(nproc > base_num_threads);

        // Calculate new maxLevels
        kmp_uint32 old_sz = skipPerLevel[depth-1];
        kmp_uint32 incs = 0, old_maxLevels = maxLevels;
        while (nproc > old_sz) {
            old_sz *= 2;
            incs++;
        }
        maxLevels += incs;

        // Resize arrays
        kmp_uint32 *old_numPerLevel = numPerLevel;
        kmp_uint32 *old_skipPerLevel = skipPerLevel;
        numPerLevel = skipPerLevel = NULL;
        numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
        skipPerLevel = &(numPerLevel[maxLevels]);

        // Copy old elements from old arrays
        for (kmp_uint32 i=0; i<old_maxLevels; ++i) {
            numPerLevel[i] = old_numPerLevel[i];
            skipPerLevel[i] = old_skipPerLevel[i];
        }

        // Init new elements in arrays to 1
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) {
            numPerLevel[i] = 1;
            skipPerLevel[i] = 1;
        }

        // Free old arrays
        __kmp_free(old_numPerLevel);

        // Fill in oversubscription levels of hierarchy
        for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
            skipPerLevel[i] = 2*skipPerLevel[i-1];

        base_num_threads = nproc;
        resizing = 0; // One writer
    }
};

static hierarchy_info machine_hierarchy;

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);
    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // The loop below adjusts the depth in the case of a resize
    while (nproc > machine_hierarchy.skipPerLevel[depth-1])
        depth++;

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example: suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    // ""
    unsigned maxThreadsPerPkg;  // ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            // ""
    unsigned threadId;          // ""
};
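
// Illustrative note (assumed values, not from the original sources):
// __kmp_cpuid_mask_width() returns the number of bits needed to encode
// "count" distinct values, e.g. a maxThreadsPerPkg of 6 yields a width of 3
// (since 1<<3 == 8 >= 6) and a maxCoresPerPkg of 4 yields a width of 2.
// These widths carve the APIC id into pkg/core/thread bit fields below.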

static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be 2
    //    (for a single core chip).  On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }
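
    // Illustrative example (assumed values, not from the original sources):
    // with maxThreadsPerPkg = 8 (widthCT = 3) and maxCoresPerPkg = 4
    // (widthC = 2, so widthT = 1), an apicId of 0x1D = 0b11101 decodes as
    // pkgId = 0x1D >> 3 = 3, coreId = (0x1D >> 1) & 0x3 = 2, and
    // threadId = 0x1D & 0x1 = 1.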

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;                                 // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
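
    // Illustrative example (assumed values, not from the original sources):
    // for a sorted table covering 2 packages, each with 2 cores and 2 hardware
    // threads per core (8 entries), the loop above ends with pkgCt = 2,
    // coreCt = 2, threadCt = 2 and nCores = 4, so nPackages = 2,
    // nCoresPerPkg = 2 and __kmp_nThreadsPerCore = 2.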

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
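
    // Background note for the loop below (summarizing the cpuid leaf 11
    // convention, not original commentary): each sub-leaf "level" reports a
    // level type in ecx[15:8] (1 = SMT, 2 = core), the number of logical
    // processors at that level in ebx, and in eax[4:0] the number of bits to
    // shift the x2APIC id right to obtain the next-coarser topology id.  An
    // ebx of 0 marks the end of the enumeration.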

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We will
    // try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
        __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);
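
    // Illustrative example (assumed values, not from the original sources):
    // on a machine with 2 SMT threads/core (shift 1) and 8 cores/package
    // (cumulative shift 4), an x2APIC id of 0x26 yields
    // labels[2] = 0x26 & 0x1 = 0 (thread), labels[1] = (0x26 & 0xF) >> 1 = 3
    // (core), and labels[0] = 0x26 >> 4 = 2 (package).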

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology information
                    // in case the max value for some level (maxCt[level]) is encountered
                    // earlier than some smaller value while going through the array.
                    // For example, let pkg0 have 4 cores and pkg1 have 2 cores. Then
                    // maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
1785 // 1786 if (new_depth != depth) { 1787 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1788 sizeof(AddrUnsPair) * nApics); 1789 for (proc = 0; (int)proc < nApics; proc++) { 1790 Address addr(new_depth); 1791 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1792 } 1793 int new_level = 0; 1794 for (level = 0; level < depth; level++) { 1795 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1796 if (level == threadLevel) { 1797 threadLevel = -1; 1798 } 1799 else if ((threadLevel >= 0) && (level < threadLevel)) { 1800 threadLevel--; 1801 } 1802 if (level == coreLevel) { 1803 coreLevel = -1; 1804 } 1805 else if ((coreLevel >= 0) && (level < coreLevel)) { 1806 coreLevel--; 1807 } 1808 if (level < pkgLevel) { 1809 pkgLevel--; 1810 } 1811 continue; 1812 } 1813 for (proc = 0; (int)proc < nApics; proc++) { 1814 new_retval[proc].first.labels[new_level] 1815 = retval[proc].first.labels[level]; 1816 } 1817 new_level++; 1818 } 1819 1820 __kmp_free(retval); 1821 retval = new_retval; 1822 depth = new_depth; 1823 } 1824 1825 if (__kmp_affinity_gran_levels < 0) { 1826 // 1827 // Set the granularity level based on what levels are modeled 1828 // in the machine topology map. 1829 // 1830 __kmp_affinity_gran_levels = 0; 1831 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1832 __kmp_affinity_gran_levels++; 1833 } 1834 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1835 __kmp_affinity_gran_levels++; 1836 } 1837 if (__kmp_affinity_gran > affinity_gran_package) { 1838 __kmp_affinity_gran_levels++; 1839 } 1840 } 1841 1842 if (__kmp_affinity_verbose) { 1843 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1844 coreLevel, threadLevel); 1845 } 1846 1847 __kmp_free(last); 1848 __kmp_free(maxCt); 1849 __kmp_free(counts); 1850 __kmp_free(totals); 1851 KMP_CPU_FREE(oldMask); 1852 *address2os = retval; 1853 return depth; 1854 } 1855 1856 1857 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1858 1859 1860 #define osIdIndex 0 1861 #define threadIdIndex 1 1862 #define coreIdIndex 2 1863 #define pkgIdIndex 3 1864 #define nodeIdIndex 4 1865 1866 typedef unsigned *ProcCpuInfo; 1867 static unsigned maxIndex = pkgIdIndex; 1868 1869 1870 static int 1871 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1872 { 1873 const unsigned *aa = (const unsigned *)a; 1874 const unsigned *bb = (const unsigned *)b; 1875 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1876 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1877 return 0; 1878 }; 1879 1880 1881 static int 1882 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1883 { 1884 unsigned i; 1885 const unsigned *aa = *((const unsigned **)a); 1886 const unsigned *bb = *((const unsigned **)b); 1887 for (i = maxIndex; ; i--) { 1888 if (aa[i] < bb[i]) return -1; 1889 if (aa[i] > bb[i]) return 1; 1890 if (i == osIdIndex) break; 1891 } 1892 return 0; 1893 } 1894 1895 1896 // 1897 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1898 // affinity map. 1899 // 1900 static int 1901 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1902 kmp_i18n_id_t *const msg_id, FILE *f) 1903 { 1904 *address2os = NULL; 1905 *msg_id = kmp_i18n_null; 1906 1907 // 1908 // Scan of the file, and count the number of "processor" (osId) fields, 1909 // and find the highest value of <n> for a node_<n> field. 1910 // 1911 char buf[256]; 1912 unsigned num_records = 0; 1913 while (! feof(f)) { 1914 buf[sizeof(buf) - 1] = 1; 1915 if (! 
fgets(buf, sizeof(buf), f)) { 1916 // 1917 // Read errors presumably because of EOF 1918 // 1919 break; 1920 } 1921 1922 char s1[] = "processor"; 1923 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1924 num_records++; 1925 continue; 1926 } 1927 1928 // 1929 // FIXME - this will match "node_<n> <garbage>" 1930 // 1931 unsigned level; 1932 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1933 if (nodeIdIndex + level >= maxIndex) { 1934 maxIndex = nodeIdIndex + level; 1935 } 1936 continue; 1937 } 1938 } 1939 1940 // 1941 // Check for empty file / no valid processor records, or too many. 1942 // The number of records can't exceed the number of valid bits in the 1943 // affinity mask. 1944 // 1945 if (num_records == 0) { 1946 *line = 0; 1947 *msg_id = kmp_i18n_str_NoProcRecords; 1948 return -1; 1949 } 1950 if (num_records > (unsigned)__kmp_xproc) { 1951 *line = 0; 1952 *msg_id = kmp_i18n_str_TooManyProcRecords; 1953 return -1; 1954 } 1955 1956 // 1957 // Set the file pointer back to the begginning, so that we can scan the 1958 // file again, this time performing a full parse of the data. 1959 // Allocate a vector of ProcCpuInfo object, where we will place the data. 1960 // Adding an extra element at the end allows us to remove a lot of extra 1961 // checks for termination conditions. 1962 // 1963 if (fseek(f, 0, SEEK_SET) != 0) { 1964 *line = 0; 1965 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1966 return -1; 1967 } 1968 1969 // 1970 // Allocate the array of records to store the proc info in. The dummy 1971 // element at the end makes the logic in filling them out easier to code. 1972 // 1973 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1974 * sizeof(unsigned *)); 1975 unsigned i; 1976 for (i = 0; i <= num_records; i++) { 1977 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1978 * sizeof(unsigned)); 1979 } 1980 1981 #define CLEANUP_THREAD_INFO \ 1982 for (i = 0; i <= num_records; i++) { \ 1983 __kmp_free(threadInfo[i]); \ 1984 } \ 1985 __kmp_free(threadInfo); 1986 1987 // 1988 // A value of UINT_MAX means that we didn't find the field 1989 // 1990 unsigned __index; 1991 1992 #define INIT_PROC_INFO(p) \ 1993 for (__index = 0; __index <= maxIndex; __index++) { \ 1994 (p)[__index] = UINT_MAX; \ 1995 } 1996 1997 for (i = 0; i <= num_records; i++) { 1998 INIT_PROC_INFO(threadInfo[i]); 1999 } 2000 2001 unsigned num_avail = 0; 2002 *line = 0; 2003 while (! feof(f)) { 2004 // 2005 // Create an inner scoping level, so that all the goto targets at the 2006 // end of the loop appear in an outer scoping level. This avoids 2007 // warnings about jumping past an initialization to a target in the 2008 // same block. 2009 // 2010 { 2011 buf[sizeof(buf) - 1] = 1; 2012 bool long_line = false; 2013 if (! fgets(buf, sizeof(buf), f)) { 2014 // 2015 // Read errors presumably because of EOF 2016 // 2017 // If there is valid data in threadInfo[num_avail], then fake 2018 // a blank line in ensure that the last address gets parsed. 2019 // 2020 bool valid = false; 2021 for (i = 0; i <= maxIndex; i++) { 2022 if (threadInfo[num_avail][i] != UINT_MAX) { 2023 valid = true; 2024 } 2025 } 2026 if (! valid) { 2027 break; 2028 } 2029 buf[0] = 0; 2030 } else if (!buf[sizeof(buf) - 1]) { 2031 // 2032 // The line is longer than the buffer. Set a flag and don't 2033 // emit an error if we were going to ignore the line, anyway. 
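                // The sentinel byte written to buf[sizeof(buf) - 1] before the
                // fgets() call is only overwritten (with '\0') when the line
                // fills the entire buffer, which is how an over-long line is
                // detected here.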
2034 // 2035 long_line = true; 2036 2037 #define CHECK_LINE \ 2038 if (long_line) { \ 2039 CLEANUP_THREAD_INFO; \ 2040 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2041 return -1; \ 2042 } 2043 } 2044 (*line)++; 2045 2046 char s1[] = "processor"; 2047 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2048 CHECK_LINE; 2049 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2050 unsigned val; 2051 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2052 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 2053 threadInfo[num_avail][osIdIndex] = val; 2054 #if KMP_OS_LINUX && USE_SYSFS_INFO 2055 char path[256]; 2056 KMP_SNPRINTF(path, sizeof(path), 2057 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2058 threadInfo[num_avail][osIdIndex]); 2059 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2060 2061 KMP_SNPRINTF(path, sizeof(path), 2062 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2063 threadInfo[num_avail][osIdIndex]); 2064 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2065 continue; 2066 #else 2067 } 2068 char s2[] = "physical id"; 2069 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2070 CHECK_LINE; 2071 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2072 unsigned val; 2073 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2074 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2075 threadInfo[num_avail][pkgIdIndex] = val; 2076 continue; 2077 } 2078 char s3[] = "core id"; 2079 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2080 CHECK_LINE; 2081 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2082 unsigned val; 2083 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2084 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2085 threadInfo[num_avail][coreIdIndex] = val; 2086 continue; 2087 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2088 } 2089 char s4[] = "thread id"; 2090 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2091 CHECK_LINE; 2092 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2093 unsigned val; 2094 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2095 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2096 threadInfo[num_avail][threadIdIndex] = val; 2097 continue; 2098 } 2099 unsigned level; 2100 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2101 CHECK_LINE; 2102 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2103 unsigned val; 2104 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2105 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2106 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2107 threadInfo[num_avail][nodeIdIndex + level] = val; 2108 continue; 2109 } 2110 2111 // 2112 // We didn't recognize the leading token on the line. 2113 // There are lots of leading tokens that we don't recognize - 2114 // if the line isn't empty, go on to the next line. 2115 // 2116 if ((*buf != 0) && (*buf != '\n')) { 2117 // 2118 // If the line is longer than the buffer, read characters 2119 // until we find a newline. 2120 // 2121 if (long_line) { 2122 int ch; 2123 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2124 } 2125 continue; 2126 } 2127 2128 // 2129 // A newline has signalled the end of the processor record. 2130 // Check that there aren't too many procs specified. 2131 // 2132 if ((int)num_avail == __kmp_xproc) { 2133 CLEANUP_THREAD_INFO; 2134 *msg_id = kmp_i18n_str_TooManyEntries; 2135 return -1; 2136 } 2137 2138 // 2139 // Check for missing fields. 
The osId field must be there, and we 2140 // currently require that the physical id field is specified, also. 2141 // 2142 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2143 CLEANUP_THREAD_INFO; 2144 *msg_id = kmp_i18n_str_MissingProcField; 2145 return -1; 2146 } 2147 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2148 CLEANUP_THREAD_INFO; 2149 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2150 return -1; 2151 } 2152 2153 // 2154 // Skip this proc if it is not included in the machine model. 2155 // 2156 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2157 INIT_PROC_INFO(threadInfo[num_avail]); 2158 continue; 2159 } 2160 2161 // 2162 // We have a successful parse of this proc's info. 2163 // Increment the counter, and prepare for the next proc. 2164 // 2165 num_avail++; 2166 KMP_ASSERT(num_avail <= num_records); 2167 INIT_PROC_INFO(threadInfo[num_avail]); 2168 } 2169 continue; 2170 2171 no_val: 2172 CLEANUP_THREAD_INFO; 2173 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2174 return -1; 2175 2176 dup_field: 2177 CLEANUP_THREAD_INFO; 2178 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2179 return -1; 2180 } 2181 *line = 0; 2182 2183 # if KMP_MIC && REDUCE_TEAM_SIZE 2184 unsigned teamSize = 0; 2185 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2186 2187 // check for num_records == __kmp_xproc ??? 2188 2189 // 2190 // If there's only one thread context to bind to, form an Address object 2191 // with depth 1 and return immediately (or, if affinity is off, set 2192 // address2os to NULL and return). 2193 // 2194 // If it is configured to omit the package level when there is only a 2195 // single package, the logic at the end of this routine won't work if 2196 // there is only a single thread - it would try to form an Address 2197 // object with depth 0. 2198 // 2199 KMP_ASSERT(num_avail > 0); 2200 KMP_ASSERT(num_avail <= num_records); 2201 if (num_avail == 1) { 2202 __kmp_ncores = 1; 2203 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2204 if (__kmp_affinity_verbose) { 2205 if (! 
KMP_AFFINITY_CAPABLE()) { 2206 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2207 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2208 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2209 } 2210 else { 2211 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2212 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2213 fullMask); 2214 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2215 if (__kmp_affinity_respect_mask) { 2216 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2217 } else { 2218 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2219 } 2220 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2221 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2222 } 2223 int index; 2224 kmp_str_buf_t buf; 2225 __kmp_str_buf_init(&buf); 2226 __kmp_str_buf_print(&buf, "1"); 2227 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2228 __kmp_str_buf_print(&buf, " x 1"); 2229 } 2230 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2231 __kmp_str_buf_free(&buf); 2232 } 2233 2234 if (__kmp_affinity_type == affinity_none) { 2235 CLEANUP_THREAD_INFO; 2236 return 0; 2237 } 2238 2239 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2240 Address addr(1); 2241 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2242 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2243 2244 if (__kmp_affinity_gran_levels < 0) { 2245 __kmp_affinity_gran_levels = 0; 2246 } 2247 2248 if (__kmp_affinity_verbose) { 2249 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2250 } 2251 2252 CLEANUP_THREAD_INFO; 2253 return 1; 2254 } 2255 2256 // 2257 // Sort the threadInfo table by physical Id. 2258 // 2259 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2260 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2261 2262 // 2263 // The table is now sorted by pkgId / coreId / threadId, but we really 2264 // don't know the radix of any of the fields. pkgId's may be sparsely 2265 // assigned among the chips on a system. Although coreId's are usually 2266 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2267 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2268 // 2269 // For that matter, we don't know what coresPerPkg and threadsPerCore 2270 // (or the total # packages) are at this point - we want to determine 2271 // that now. We only have an upper bound on the first two figures. 2272 // 2273 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2274 * sizeof(unsigned)); 2275 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2276 * sizeof(unsigned)); 2277 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2278 * sizeof(unsigned)); 2279 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2280 * sizeof(unsigned)); 2281 2282 bool assign_thread_ids = false; 2283 unsigned threadIdCt; 2284 unsigned index; 2285 2286 restart_radix_check: 2287 threadIdCt = 0; 2288 2289 // 2290 // Initialize the counter arrays with data from threadInfo[0]. 2291 // 2292 if (assign_thread_ids) { 2293 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2294 threadInfo[0][threadIdIndex] = threadIdCt++; 2295 } 2296 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2297 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2298 } 2299 } 2300 for (index = 0; index <= maxIndex; index++) { 2301 counts[index] = 1; 2302 maxCt[index] = 1; 2303 totals[index] = 1; 2304 lastId[index] = threadInfo[0][index];; 2305 } 2306 2307 // 2308 // Run through the rest of the OS procs. 
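    // For each level, counts[] tracks how many distinct ids have been seen
    // under the current parent, maxCt[] records the largest such count, and
    // totals[] accumulates the total number of distinct nodes at that level.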
    //
    for (i = 1; i < num_avail; i++) {
        //
        // Find the most significant index whose id differs
        // from the id for the previous OS proc.
        //
        for (index = maxIndex; index >= threadIdIndex; index--) {
            if (assign_thread_ids && (index == threadIdIndex)) {
                //
                // Auto-assign the thread id field if it wasn't specified.
                //
                if (threadInfo[i][threadIdIndex] == UINT_MAX) {
                    threadInfo[i][threadIdIndex] = threadIdCt++;
                }

                //
                // Apparently the thread id field was specified for some
                // entries and not others.  Start the thread id counter
                // off at the next higher thread id.
                //
                else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
                    threadIdCt = threadInfo[i][threadIdIndex] + 1;
                }
            }
            if (threadInfo[i][index] != lastId[index]) {
                //
                // Run through all indices which are less significant,
                // and reset the counts to 1.
                //
                // At all levels up to and including index, we need to
                // increment the totals and record the last id.
                //
                unsigned index2;
                for (index2 = threadIdIndex; index2 < index; index2++) {
                    totals[index2]++;
                    if (counts[index2] > maxCt[index2]) {
                        maxCt[index2] = counts[index2];
                    }
                    counts[index2] = 1;
                    lastId[index2] = threadInfo[i][index2];
                }
                counts[index]++;
                totals[index]++;
                lastId[index] = threadInfo[i][index];

                if (assign_thread_ids && (index > threadIdIndex)) {

# if KMP_MIC && REDUCE_TEAM_SIZE
                    //
                    // The default team size is the total #threads in the machine
                    // minus 1 thread for every core that has 3 or more threads.
                    //
                    teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
# endif // KMP_MIC && REDUCE_TEAM_SIZE

                    //
                    // Restart the thread counter, as we are on a new core.
                    //
                    threadIdCt = 0;

                    //
                    // Auto-assign the thread id field if it wasn't specified.
                    //
                    if (threadInfo[i][threadIdIndex] == UINT_MAX) {
                        threadInfo[i][threadIdIndex] = threadIdCt++;
                    }

                    //
                    // Apparently the thread id field was specified for some
                    // entries and not others.  Start the thread id counter
                    // off at the next higher thread id.
                    //
                    else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
                        threadIdCt = threadInfo[i][threadIdIndex] + 1;
                    }
                }
                break;
            }
        }
        if (index < threadIdIndex) {
            //
            // If thread ids were specified, it is an error if they are not
            // unique.  Also, check that we haven't already restarted the
            // loop (to be safe - shouldn't need to).
            //
            if ((threadInfo[i][threadIdIndex] != UINT_MAX)
              || assign_thread_ids) {
                __kmp_free(lastId);
                __kmp_free(totals);
                __kmp_free(maxCt);
                __kmp_free(counts);
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
                return -1;
            }

            //
            // If the thread ids were not specified and we see entries
            // that are duplicates, start the loop over and
            // assign the thread ids manually.
            //
            assign_thread_ids = true;
            goto restart_radix_check;
        }
    }

# if KMP_MIC && REDUCE_TEAM_SIZE
    //
    // The default team size is the total #threads in the machine
    // minus 1 thread for every core that has 3 or more threads.
    //
    teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2421 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2422 2423 for (index = threadIdIndex; index <= maxIndex; index++) { 2424 if (counts[index] > maxCt[index]) { 2425 maxCt[index] = counts[index]; 2426 } 2427 } 2428 2429 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2430 nCoresPerPkg = maxCt[coreIdIndex]; 2431 nPackages = totals[pkgIdIndex]; 2432 2433 // 2434 // Check to see if the machine topology is uniform 2435 // 2436 unsigned prod = totals[maxIndex]; 2437 for (index = threadIdIndex; index < maxIndex; index++) { 2438 prod *= maxCt[index]; 2439 } 2440 bool uniform = (prod == totals[threadIdIndex]); 2441 2442 // 2443 // When affinity is off, this routine will still be called to set 2444 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2445 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2446 // correctly, and return now if affinity is not enabled. 2447 // 2448 __kmp_ncores = totals[coreIdIndex]; 2449 2450 if (__kmp_affinity_verbose) { 2451 if (! KMP_AFFINITY_CAPABLE()) { 2452 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2453 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2454 if (uniform) { 2455 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2456 } else { 2457 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2458 } 2459 } 2460 else { 2461 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2462 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2463 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2464 if (__kmp_affinity_respect_mask) { 2465 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2466 } else { 2467 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2468 } 2469 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2470 if (uniform) { 2471 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2472 } else { 2473 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2474 } 2475 } 2476 kmp_str_buf_t buf; 2477 __kmp_str_buf_init(&buf); 2478 2479 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2480 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2481 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2482 } 2483 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2484 maxCt[threadIdIndex], __kmp_ncores); 2485 2486 __kmp_str_buf_free(&buf); 2487 } 2488 2489 # if KMP_MIC && REDUCE_TEAM_SIZE 2490 // 2491 // Set the default team size. 2492 // 2493 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2494 __kmp_dflt_team_nth = teamSize; 2495 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2496 __kmp_dflt_team_nth)); 2497 } 2498 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2499 2500 if (__kmp_affinity_type == affinity_none) { 2501 __kmp_free(lastId); 2502 __kmp_free(totals); 2503 __kmp_free(maxCt); 2504 __kmp_free(counts); 2505 CLEANUP_THREAD_INFO; 2506 return 0; 2507 } 2508 2509 // 2510 // Count the number of levels which have more nodes at that level than 2511 // at the parent's level (with there being an implicit root node of 2512 // the top level). This is equivalent to saying that there is at least 2513 // one node at this level which has a sibling. These levels are in the 2514 // map, and the package level is always in the map. 
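    // For example, with 2 packages x 4 cores and one thread context per
    // core, totals[] is the same at the thread and core levels, so the
    // thread level is left out of the map and the resulting depth is 2.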
2515 // 2516 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2517 int level = 0; 2518 for (index = threadIdIndex; index < maxIndex; index++) { 2519 KMP_ASSERT(totals[index] >= totals[index + 1]); 2520 inMap[index] = (totals[index] > totals[index + 1]); 2521 } 2522 inMap[maxIndex] = (totals[maxIndex] > 1); 2523 inMap[pkgIdIndex] = true; 2524 2525 int depth = 0; 2526 for (index = threadIdIndex; index <= maxIndex; index++) { 2527 if (inMap[index]) { 2528 depth++; 2529 } 2530 } 2531 KMP_ASSERT(depth > 0); 2532 2533 // 2534 // Construct the data structure that is to be returned. 2535 // 2536 *address2os = (AddrUnsPair*) 2537 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2538 int pkgLevel = -1; 2539 int coreLevel = -1; 2540 int threadLevel = -1; 2541 2542 for (i = 0; i < num_avail; ++i) { 2543 Address addr(depth); 2544 unsigned os = threadInfo[i][osIdIndex]; 2545 int src_index; 2546 int dst_index = 0; 2547 2548 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2549 if (! inMap[src_index]) { 2550 continue; 2551 } 2552 addr.labels[dst_index] = threadInfo[i][src_index]; 2553 if (src_index == pkgIdIndex) { 2554 pkgLevel = dst_index; 2555 } 2556 else if (src_index == coreIdIndex) { 2557 coreLevel = dst_index; 2558 } 2559 else if (src_index == threadIdIndex) { 2560 threadLevel = dst_index; 2561 } 2562 dst_index++; 2563 } 2564 (*address2os)[i] = AddrUnsPair(addr, os); 2565 } 2566 2567 if (__kmp_affinity_gran_levels < 0) { 2568 // 2569 // Set the granularity level based on what levels are modeled 2570 // in the machine topology map. 2571 // 2572 unsigned src_index; 2573 __kmp_affinity_gran_levels = 0; 2574 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2575 if (! inMap[src_index]) { 2576 continue; 2577 } 2578 switch (src_index) { 2579 case threadIdIndex: 2580 if (__kmp_affinity_gran > affinity_gran_thread) { 2581 __kmp_affinity_gran_levels++; 2582 } 2583 2584 break; 2585 case coreIdIndex: 2586 if (__kmp_affinity_gran > affinity_gran_core) { 2587 __kmp_affinity_gran_levels++; 2588 } 2589 break; 2590 2591 case pkgIdIndex: 2592 if (__kmp_affinity_gran > affinity_gran_package) { 2593 __kmp_affinity_gran_levels++; 2594 } 2595 break; 2596 } 2597 } 2598 } 2599 2600 if (__kmp_affinity_verbose) { 2601 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2602 coreLevel, threadLevel); 2603 } 2604 2605 __kmp_free(inMap); 2606 __kmp_free(lastId); 2607 __kmp_free(totals); 2608 __kmp_free(maxCt); 2609 __kmp_free(counts); 2610 CLEANUP_THREAD_INFO; 2611 return depth; 2612 } 2613 2614 2615 // 2616 // Create and return a table of affinity masks, indexed by OS thread ID. 2617 // This routine handles OR'ing together all the affinity masks of threads 2618 // that are sufficiently close, if granularity > fine. 2619 // 2620 static kmp_affin_mask_t * 2621 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2622 AddrUnsPair *address2os, unsigned numAddrs) 2623 { 2624 // 2625 // First form a table of affinity masks in order of OS thread id. 2626 // 2627 unsigned depth; 2628 unsigned maxOsId; 2629 unsigned i; 2630 2631 KMP_ASSERT(numAddrs > 0); 2632 depth = address2os[0].first.depth; 2633 2634 maxOsId = 0; 2635 for (i = 0; i < numAddrs; i++) { 2636 unsigned osId = address2os[i].second; 2637 if (osId > maxOsId) { 2638 maxOsId = osId; 2639 } 2640 } 2641 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2642 (maxOsId + 1) * __kmp_affin_mask_size); 2643 2644 // 2645 // Sort the address2os table according to physical order. 
Doing so 2646 // will put all threads on the same core/package/node in consecutive 2647 // locations. 2648 // 2649 qsort(address2os, numAddrs, sizeof(*address2os), 2650 __kmp_affinity_cmp_Address_labels); 2651 2652 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2653 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2654 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2655 } 2656 if (__kmp_affinity_gran_levels >= (int)depth) { 2657 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2658 && (__kmp_affinity_type != affinity_none))) { 2659 KMP_WARNING(AffThreadsMayMigrate); 2660 } 2661 } 2662 2663 // 2664 // Run through the table, forming the masks for all threads on each 2665 // core. Threads on the same core will have identical "Address" 2666 // objects, not considering the last level, which must be the thread 2667 // id. All threads on a core will appear consecutively. 2668 // 2669 unsigned unique = 0; 2670 unsigned j = 0; // index of 1st thread on core 2671 unsigned leader = 0; 2672 Address *leaderAddr = &(address2os[0].first); 2673 kmp_affin_mask_t *sum 2674 = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 2675 KMP_CPU_ZERO(sum); 2676 KMP_CPU_SET(address2os[0].second, sum); 2677 for (i = 1; i < numAddrs; i++) { 2678 // 2679 // If this thread is sufficiently close to the leader (within the 2680 // granularity setting), then set the bit for this os thread in the 2681 // affinity mask for this group, and go on to the next thread. 2682 // 2683 if (leaderAddr->isClose(address2os[i].first, 2684 __kmp_affinity_gran_levels)) { 2685 KMP_CPU_SET(address2os[i].second, sum); 2686 continue; 2687 } 2688 2689 // 2690 // For every thread in this group, copy the mask to the thread's 2691 // entry in the osId2Mask table. Mark the first address as a 2692 // leader. 2693 // 2694 for (; j < i; j++) { 2695 unsigned osId = address2os[j].second; 2696 KMP_DEBUG_ASSERT(osId <= maxOsId); 2697 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2698 KMP_CPU_COPY(mask, sum); 2699 address2os[j].first.leader = (j == leader); 2700 } 2701 unique++; 2702 2703 // 2704 // Start a new mask. 2705 // 2706 leader = i; 2707 leaderAddr = &(address2os[i].first); 2708 KMP_CPU_ZERO(sum); 2709 KMP_CPU_SET(address2os[i].second, sum); 2710 } 2711 2712 // 2713 // For every thread in last group, copy the mask to the thread's 2714 // entry in the osId2Mask table. 2715 // 2716 for (; j < i; j++) { 2717 unsigned osId = address2os[j].second; 2718 KMP_DEBUG_ASSERT(osId <= maxOsId); 2719 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2720 KMP_CPU_COPY(mask, sum); 2721 address2os[j].first.leader = (j == leader); 2722 } 2723 unique++; 2724 2725 *maxIndex = maxOsId; 2726 *numUnique = unique; 2727 return osId2Mask; 2728 } 2729 2730 2731 // 2732 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2733 // as file-static than to try and pass them through the calling sequence of 2734 // the recursive-descent OMP_PLACES parser. 
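// newMasks is grown by doubling (via KMP_INTERNAL_REALLOC) whenever ADD_MASK
// runs out of room; nextNewMask is the number of masks committed to it so far.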
2735 // 2736 static kmp_affin_mask_t *newMasks; 2737 static int numNewMasks; 2738 static int nextNewMask; 2739 2740 #define ADD_MASK(_mask) \ 2741 { \ 2742 if (nextNewMask >= numNewMasks) { \ 2743 numNewMasks *= 2; \ 2744 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2745 numNewMasks * __kmp_affin_mask_size); \ 2746 } \ 2747 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2748 nextNewMask++; \ 2749 } 2750 2751 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2752 { \ 2753 if (((_osId) > _maxOsId) || \ 2754 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2755 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2756 && (__kmp_affinity_type != affinity_none))) { \ 2757 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2758 } \ 2759 } \ 2760 else { \ 2761 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2762 } \ 2763 } 2764 2765 2766 // 2767 // Re-parse the proclist (for the explicit affinity type), and form the list 2768 // of affinity newMasks indexed by gtid. 2769 // 2770 static void 2771 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2772 unsigned int *out_numMasks, const char *proclist, 2773 kmp_affin_mask_t *osId2Mask, int maxOsId) 2774 { 2775 const char *scan = proclist; 2776 const char *next = proclist; 2777 2778 // 2779 // We use malloc() for the temporary mask vector, 2780 // so that we can use realloc() to extend it. 2781 // 2782 numNewMasks = 2; 2783 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2784 * __kmp_affin_mask_size); 2785 nextNewMask = 0; 2786 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2787 __kmp_affin_mask_size); 2788 int setSize = 0; 2789 2790 for (;;) { 2791 int start, end, stride; 2792 2793 SKIP_WS(scan); 2794 next = scan; 2795 if (*next == '\0') { 2796 break; 2797 } 2798 2799 if (*next == '{') { 2800 int num; 2801 setSize = 0; 2802 next++; // skip '{' 2803 SKIP_WS(next); 2804 scan = next; 2805 2806 // 2807 // Read the first integer in the set. 2808 // 2809 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2810 "bad proclist"); 2811 SKIP_DIGITS(next); 2812 num = __kmp_str_to_int(scan, *next); 2813 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2814 2815 // 2816 // Copy the mask for that osId to the sum (union) mask. 2817 // 2818 if ((num > maxOsId) || 2819 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2820 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2821 && (__kmp_affinity_type != affinity_none))) { 2822 KMP_WARNING(AffIgnoreInvalidProcID, num); 2823 } 2824 KMP_CPU_ZERO(sumMask); 2825 } 2826 else { 2827 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2828 setSize = 1; 2829 } 2830 2831 for (;;) { 2832 // 2833 // Check for end of set. 2834 // 2835 SKIP_WS(next); 2836 if (*next == '}') { 2837 next++; // skip '}' 2838 break; 2839 } 2840 2841 // 2842 // Skip optional comma. 2843 // 2844 if (*next == ',') { 2845 next++; 2846 } 2847 SKIP_WS(next); 2848 2849 // 2850 // Read the next integer in the set. 2851 // 2852 scan = next; 2853 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2854 "bad explicit proc list"); 2855 2856 SKIP_DIGITS(next); 2857 num = __kmp_str_to_int(scan, *next); 2858 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2859 2860 // 2861 // Add the mask for that osId to the sum mask. 2862 // 2863 if ((num > maxOsId) || 2864 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2865 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2866 && (__kmp_affinity_type != affinity_none))) { 2867 KMP_WARNING(AffIgnoreInvalidProcID, num); 2868 } 2869 } 2870 else { 2871 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2872 setSize++; 2873 } 2874 } 2875 if (setSize > 0) { 2876 ADD_MASK(sumMask); 2877 } 2878 2879 SKIP_WS(next); 2880 if (*next == ',') { 2881 next++; 2882 } 2883 scan = next; 2884 continue; 2885 } 2886 2887 // 2888 // Read the first integer. 2889 // 2890 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2891 SKIP_DIGITS(next); 2892 start = __kmp_str_to_int(scan, *next); 2893 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2894 SKIP_WS(next); 2895 2896 // 2897 // If this isn't a range, then add a mask to the list and go on. 2898 // 2899 if (*next != '-') { 2900 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2901 2902 // 2903 // Skip optional comma. 2904 // 2905 if (*next == ',') { 2906 next++; 2907 } 2908 scan = next; 2909 continue; 2910 } 2911 2912 // 2913 // This is a range. Skip over the '-' and read in the 2nd int. 2914 // 2915 next++; // skip '-' 2916 SKIP_WS(next); 2917 scan = next; 2918 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2919 SKIP_DIGITS(next); 2920 end = __kmp_str_to_int(scan, *next); 2921 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2922 2923 // 2924 // Check for a stride parameter 2925 // 2926 stride = 1; 2927 SKIP_WS(next); 2928 if (*next == ':') { 2929 // 2930 // A stride is specified. Skip over the ':" and read the 3rd int. 2931 // 2932 int sign = +1; 2933 next++; // skip ':' 2934 SKIP_WS(next); 2935 scan = next; 2936 if (*next == '-') { 2937 sign = -1; 2938 next++; 2939 SKIP_WS(next); 2940 scan = next; 2941 } 2942 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2943 "bad explicit proc list"); 2944 SKIP_DIGITS(next); 2945 stride = __kmp_str_to_int(scan, *next); 2946 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2947 stride *= sign; 2948 } 2949 2950 // 2951 // Do some range checks. 2952 // 2953 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2954 if (stride > 0) { 2955 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2956 } 2957 else { 2958 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2959 } 2960 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2961 2962 // 2963 // Add the mask for each OS proc # to the list. 2964 // 2965 if (stride > 0) { 2966 do { 2967 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2968 start += stride; 2969 } while (start <= end); 2970 } 2971 else { 2972 do { 2973 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2974 start += stride; 2975 } while (start >= end); 2976 } 2977 2978 // 2979 // Skip optional comma. 2980 // 2981 SKIP_WS(next); 2982 if (*next == ',') { 2983 next++; 2984 } 2985 scan = next; 2986 } 2987 2988 *out_numMasks = nextNewMask; 2989 if (nextNewMask == 0) { 2990 *out_masks = NULL; 2991 KMP_INTERNAL_FREE(newMasks); 2992 return; 2993 } 2994 *out_masks 2995 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2996 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2997 __kmp_free(sumMask); 2998 KMP_INTERNAL_FREE(newMasks); 2999 } 3000 3001 3002 # if OMP_40_ENABLED 3003 3004 /*----------------------------------------------------------------------------- 3005 3006 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3007 places. 
Again, here is the grammar:

 place_list := place
 place_list := place , place_list
 place := num
 place := place : num
 place := place : num : signed
 place := { subplace_list }
 place := ! place                  // (lowest priority)
 subplace_list := subplace
 subplace_list := subplace , subplace_list
 subplace := num
 subplace := num : num
 subplace := num : num : signed
 signed := num
 signed := + signed
 signed := - signed

-----------------------------------------------------------------------------*/

static void
__kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
  int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
{
    const char *next;

    for (;;) {
        int start, count, stride, i;

        //
        // Read in the starting proc id
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        start = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(start >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            if ((start > maxOsId) ||
              (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
                if (__kmp_affinity_verbose || (__kmp_affinity_warnings
                  && (__kmp_affinity_type != affinity_none))) {
                    KMP_WARNING(AffIgnoreInvalidProcID, start);
                }
            }
            else {
                KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
                (*setSize)++;
            }
            if (**scan == '}') {
                break;
            }
            (*scan)++;  // skip ','
            continue;
        }
        KMP_ASSERT2(**scan == ':', "bad explicit places list");
        (*scan)++;      // skip ':'

        //
        // Read count parameter
        //
        SKIP_WS(*scan);
        KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
          "bad explicit places list");
        next = *scan;
        SKIP_DIGITS(next);
        count = __kmp_str_to_int(*scan, *next);
        KMP_ASSERT(count >= 0);
        *scan = next;

        //
        // valid follow sets are ',' ':' and '}'
        //
        SKIP_WS(*scan);
        if (**scan == '}' || **scan == ',') {
            for (i = 0; i < count; i++) {
                if ((start > maxOsId) ||
                  (!
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3093 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3094 && (__kmp_affinity_type != affinity_none))) { 3095 KMP_WARNING(AffIgnoreInvalidProcID, start); 3096 } 3097 break; // don't proliferate warnings for large count 3098 } 3099 else { 3100 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3101 start++; 3102 (*setSize)++; 3103 } 3104 } 3105 if (**scan == '}') { 3106 break; 3107 } 3108 (*scan)++; // skip ',' 3109 continue; 3110 } 3111 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3112 (*scan)++; // skip ':' 3113 3114 // 3115 // Read stride parameter 3116 // 3117 int sign = +1; 3118 for (;;) { 3119 SKIP_WS(*scan); 3120 if (**scan == '+') { 3121 (*scan)++; // skip '+' 3122 continue; 3123 } 3124 if (**scan == '-') { 3125 sign *= -1; 3126 (*scan)++; // skip '-' 3127 continue; 3128 } 3129 break; 3130 } 3131 SKIP_WS(*scan); 3132 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3133 "bad explicit places list"); 3134 next = *scan; 3135 SKIP_DIGITS(next); 3136 stride = __kmp_str_to_int(*scan, *next); 3137 KMP_ASSERT(stride >= 0); 3138 *scan = next; 3139 stride *= sign; 3140 3141 // 3142 // valid follow sets are ',' and '}' 3143 // 3144 SKIP_WS(*scan); 3145 if (**scan == '}' || **scan == ',') { 3146 for (i = 0; i < count; i++) { 3147 if ((start > maxOsId) || 3148 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3149 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3150 && (__kmp_affinity_type != affinity_none))) { 3151 KMP_WARNING(AffIgnoreInvalidProcID, start); 3152 } 3153 break; // don't proliferate warnings for large count 3154 } 3155 else { 3156 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3157 start += stride; 3158 (*setSize)++; 3159 } 3160 } 3161 if (**scan == '}') { 3162 break; 3163 } 3164 (*scan)++; // skip ',' 3165 continue; 3166 } 3167 3168 KMP_ASSERT2(0, "bad explicit places list"); 3169 } 3170 } 3171 3172 3173 static void 3174 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3175 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3176 { 3177 const char *next; 3178 3179 // 3180 // valid follow sets are '{' '!' and num 3181 // 3182 SKIP_WS(*scan); 3183 if (**scan == '{') { 3184 (*scan)++; // skip '{' 3185 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3186 setSize); 3187 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3188 (*scan)++; // skip '}' 3189 } 3190 else if (**scan == '!') { 3191 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3192 KMP_CPU_COMPLEMENT(tempMask); 3193 (*scan)++; // skip '!' 3194 } 3195 else if ((**scan >= '0') && (**scan <= '9')) { 3196 next = *scan; 3197 SKIP_DIGITS(next); 3198 int num = __kmp_str_to_int(*scan, *next); 3199 KMP_ASSERT(num >= 0); 3200 if ((num > maxOsId) || 3201 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3202 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3203 && (__kmp_affinity_type != affinity_none))) { 3204 KMP_WARNING(AffIgnoreInvalidProcID, num); 3205 } 3206 } 3207 else { 3208 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3209 (*setSize)++; 3210 } 3211 *scan = next; // skip num 3212 } 3213 else { 3214 KMP_ASSERT2(0, "bad explicit places list"); 3215 } 3216 } 3217 3218 3219 //static void 3220 void 3221 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3222 unsigned int *out_numMasks, const char *placelist, 3223 kmp_affin_mask_t *osId2Mask, int maxOsId) 3224 { 3225 const char *scan = placelist; 3226 const char *next = placelist; 3227 3228 numNewMasks = 2; 3229 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3230 * __kmp_affin_mask_size); 3231 nextNewMask = 0; 3232 3233 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3234 __kmp_affin_mask_size); 3235 KMP_CPU_ZERO(tempMask); 3236 int setSize = 0; 3237 3238 for (;;) { 3239 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3240 3241 // 3242 // valid follow sets are ',' ':' and EOL 3243 // 3244 SKIP_WS(scan); 3245 if (*scan == '\0' || *scan == ',') { 3246 if (setSize > 0) { 3247 ADD_MASK(tempMask); 3248 } 3249 KMP_CPU_ZERO(tempMask); 3250 setSize = 0; 3251 if (*scan == '\0') { 3252 break; 3253 } 3254 scan++; // skip ',' 3255 continue; 3256 } 3257 3258 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3259 scan++; // skip ':' 3260 3261 // 3262 // Read count parameter 3263 // 3264 SKIP_WS(scan); 3265 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3266 "bad explicit places list"); 3267 next = scan; 3268 SKIP_DIGITS(next); 3269 int count = __kmp_str_to_int(scan, *next); 3270 KMP_ASSERT(count >= 0); 3271 scan = next; 3272 3273 // 3274 // valid follow sets are ',' ':' and EOL 3275 // 3276 SKIP_WS(scan); 3277 int stride; 3278 if (*scan == '\0' || *scan == ',') { 3279 stride = +1; 3280 } 3281 else { 3282 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3283 scan++; // skip ':' 3284 3285 // 3286 // Read stride parameter 3287 // 3288 int sign = +1; 3289 for (;;) { 3290 SKIP_WS(scan); 3291 if (*scan == '+') { 3292 scan++; // skip '+' 3293 continue; 3294 } 3295 if (*scan == '-') { 3296 sign *= -1; 3297 scan++; // skip '-' 3298 continue; 3299 } 3300 break; 3301 } 3302 SKIP_WS(scan); 3303 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3304 "bad explicit places list"); 3305 next = scan; 3306 SKIP_DIGITS(next); 3307 stride = __kmp_str_to_int(scan, *next); 3308 KMP_DEBUG_ASSERT(stride >= 0); 3309 scan = next; 3310 stride *= sign; 3311 } 3312 3313 if (stride > 0) { 3314 int i; 3315 for (i = 0; i < count; i++) { 3316 int j; 3317 if (setSize == 0) { 3318 break; 3319 } 3320 ADD_MASK(tempMask); 3321 setSize = 0; 3322 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3323 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3324 KMP_CPU_CLR(j, tempMask); 3325 } 3326 else if ((j > maxOsId) || 3327 (! 
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3328 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3329 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3330 KMP_WARNING(AffIgnoreInvalidProcID, j); 3331 } 3332 KMP_CPU_CLR(j, tempMask); 3333 } 3334 else { 3335 KMP_CPU_SET(j, tempMask); 3336 setSize++; 3337 } 3338 } 3339 for (; j >= 0; j--) { 3340 KMP_CPU_CLR(j, tempMask); 3341 } 3342 } 3343 } 3344 else { 3345 int i; 3346 for (i = 0; i < count; i++) { 3347 int j; 3348 if (setSize == 0) { 3349 break; 3350 } 3351 ADD_MASK(tempMask); 3352 setSize = 0; 3353 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 3354 j++) { 3355 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3356 KMP_CPU_CLR(j, tempMask); 3357 } 3358 else if ((j > maxOsId) || 3359 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3360 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3361 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3362 KMP_WARNING(AffIgnoreInvalidProcID, j); 3363 } 3364 KMP_CPU_CLR(j, tempMask); 3365 } 3366 else { 3367 KMP_CPU_SET(j, tempMask); 3368 setSize++; 3369 } 3370 } 3371 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3372 KMP_CPU_CLR(j, tempMask); 3373 } 3374 } 3375 } 3376 KMP_CPU_ZERO(tempMask); 3377 setSize = 0; 3378 3379 // 3380 // valid follow sets are ',' and EOL 3381 // 3382 SKIP_WS(scan); 3383 if (*scan == '\0') { 3384 break; 3385 } 3386 if (*scan == ',') { 3387 scan++; // skip ',' 3388 continue; 3389 } 3390 3391 KMP_ASSERT2(0, "bad explicit places list"); 3392 } 3393 3394 *out_numMasks = nextNewMask; 3395 if (nextNewMask == 0) { 3396 *out_masks = NULL; 3397 KMP_INTERNAL_FREE(newMasks); 3398 return; 3399 } 3400 *out_masks 3401 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3402 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3403 __kmp_free(tempMask); 3404 KMP_INTERNAL_FREE(newMasks); 3405 } 3406 3407 # endif /* OMP_40_ENABLED */ 3408 3409 #undef ADD_MASK 3410 #undef ADD_MASK_OSID 3411 3412 static void 3413 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3414 { 3415 if ( __kmp_place_num_cores == 0 ) { 3416 if ( __kmp_place_num_threads_per_core == 0 ) { 3417 return; // no cores limiting actions requested, exit 3418 } 3419 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3420 } 3421 if ( !__kmp_affinity_uniform_topology() ) { 3422 KMP_WARNING( AffThrPlaceNonUniform ); 3423 return; // don't support non-uniform topology 3424 } 3425 if ( depth != 3 ) { 3426 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3427 return; // don't support not-3-level topology 3428 } 3429 if ( __kmp_place_num_threads_per_core == 0 ) { 3430 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3431 } 3432 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3433 KMP_WARNING( AffThrPlaceManyCores ); 3434 return; 3435 } 3436 3437 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3438 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3439 int i, j, k, n_old = 0, n_new = 0; 3440 for ( i = 0; i < nPackages; ++i ) { 3441 for ( j = 0; j < nCoresPerPkg; ++j ) { 3442 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3443 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3444 } else { 3445 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3446 if ( k < __kmp_place_num_threads_per_core ) { 3447 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' 
data to new location 3448 n_new++; 3449 } 3450 n_old++; 3451 } 3452 } 3453 } 3454 } 3455 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3456 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3457 __kmp_avail_proc = n_new; // correct avail_proc 3458 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3459 3460 __kmp_free( *pAddr ); 3461 *pAddr = newAddr; // replace old topology with new one 3462 } 3463 3464 3465 static AddrUnsPair *address2os = NULL; 3466 static int * procarr = NULL; 3467 static int __kmp_aff_depth = 0; 3468 3469 static void 3470 __kmp_aux_affinity_initialize(void) 3471 { 3472 if (__kmp_affinity_masks != NULL) { 3473 KMP_ASSERT(fullMask != NULL); 3474 return; 3475 } 3476 3477 // 3478 // Create the "full" mask - this defines all of the processors that we 3479 // consider to be in the machine model. If respect is set, then it is 3480 // the initialization thread's affinity mask. Otherwise, it is all 3481 // processors that we know about on the machine. 3482 // 3483 if (fullMask == NULL) { 3484 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3485 } 3486 if (KMP_AFFINITY_CAPABLE()) { 3487 if (__kmp_affinity_respect_mask) { 3488 __kmp_get_system_affinity(fullMask, TRUE); 3489 3490 // 3491 // Count the number of available processors. 3492 // 3493 unsigned i; 3494 __kmp_avail_proc = 0; 3495 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3496 if (! KMP_CPU_ISSET(i, fullMask)) { 3497 continue; 3498 } 3499 __kmp_avail_proc++; 3500 } 3501 if (__kmp_avail_proc > __kmp_xproc) { 3502 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3503 && (__kmp_affinity_type != affinity_none))) { 3504 KMP_WARNING(ErrorInitializeAffinity); 3505 } 3506 __kmp_affinity_type = affinity_none; 3507 KMP_AFFINITY_DISABLE(); 3508 return; 3509 } 3510 } 3511 else { 3512 __kmp_affinity_entire_machine_mask(fullMask); 3513 __kmp_avail_proc = __kmp_xproc; 3514 } 3515 } 3516 3517 int depth = -1; 3518 kmp_i18n_id_t msg_id = kmp_i18n_null; 3519 3520 // 3521 // For backward compatibility, setting KMP_CPUINFO_FILE => 3522 // KMP_TOPOLOGY_METHOD=cpuinfo 3523 // 3524 if ((__kmp_cpuinfo_file != NULL) && 3525 (__kmp_affinity_top_method == affinity_top_method_all)) { 3526 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3527 } 3528 3529 if (__kmp_affinity_top_method == affinity_top_method_all) { 3530 // 3531 // In the default code path, errors are not fatal - we just try using 3532 // another method. We only emit a warning message if affinity is on, 3533 // or the verbose flag is set, an the nowarnings flag was not set. 
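        // The discovery methods below are tried in order: x2APIC ids
        // (cpuid leaf 11), then legacy APIC ids, then parsing /proc/cpuinfo
        // on Linux, then Windows processor groups, and finally the flat
        // OS proc map as a last resort.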
3534 // 3535 const char *file_name = NULL; 3536 int line = 0; 3537 3538 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3539 3540 if (__kmp_affinity_verbose) { 3541 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3542 } 3543 3544 file_name = NULL; 3545 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3546 if (depth == 0) { 3547 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3548 KMP_ASSERT(address2os == NULL); 3549 return; 3550 } 3551 3552 if (depth < 0) { 3553 if (__kmp_affinity_verbose) { 3554 if (msg_id != kmp_i18n_null) { 3555 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3556 KMP_I18N_STR(DecodingLegacyAPIC)); 3557 } 3558 else { 3559 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3560 } 3561 } 3562 3563 file_name = NULL; 3564 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3565 if (depth == 0) { 3566 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3567 KMP_ASSERT(address2os == NULL); 3568 return; 3569 } 3570 } 3571 3572 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3573 3574 # if KMP_OS_LINUX 3575 3576 if (depth < 0) { 3577 if (__kmp_affinity_verbose) { 3578 if (msg_id != kmp_i18n_null) { 3579 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3580 } 3581 else { 3582 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3583 } 3584 } 3585 3586 FILE *f = fopen("/proc/cpuinfo", "r"); 3587 if (f == NULL) { 3588 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3589 } 3590 else { 3591 file_name = "/proc/cpuinfo"; 3592 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3593 fclose(f); 3594 if (depth == 0) { 3595 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3596 KMP_ASSERT(address2os == NULL); 3597 return; 3598 } 3599 } 3600 } 3601 3602 # endif /* KMP_OS_LINUX */ 3603 3604 # if KMP_GROUP_AFFINITY 3605 3606 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3607 if (__kmp_affinity_verbose) { 3608 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3609 } 3610 3611 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3612 KMP_ASSERT(depth != 0); 3613 } 3614 3615 # endif /* KMP_GROUP_AFFINITY */ 3616 3617 if (depth < 0) { 3618 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3619 if (file_name == NULL) { 3620 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3621 } 3622 else if (line == 0) { 3623 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3624 } 3625 else { 3626 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3627 } 3628 } 3629 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3630 3631 file_name = ""; 3632 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3633 if (depth == 0) { 3634 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3635 KMP_ASSERT(address2os == NULL); 3636 return; 3637 } 3638 KMP_ASSERT(depth > 0); 3639 KMP_ASSERT(address2os != NULL); 3640 } 3641 } 3642 3643 // 3644 // If the user has specified that a paricular topology discovery method 3645 // is to be used, then we abort if that method fails. The exception is 3646 // group affinity, which might have been implicitly set. 
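    // In all of the branches below, a return value of 0 from the map
    // creation routine means affinity was forced off (affinity_none) and no
    // address2os table was built; a negative value means the method failed.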
3647 // 3648 3649 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3650 3651 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3652 if (__kmp_affinity_verbose) { 3653 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3654 KMP_I18N_STR(Decodingx2APIC)); 3655 } 3656 3657 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3658 if (depth == 0) { 3659 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3660 KMP_ASSERT(address2os == NULL); 3661 return; 3662 } 3663 if (depth < 0) { 3664 KMP_ASSERT(msg_id != kmp_i18n_null); 3665 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3666 } 3667 } 3668 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3669 if (__kmp_affinity_verbose) { 3670 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3671 KMP_I18N_STR(DecodingLegacyAPIC)); 3672 } 3673 3674 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3675 if (depth == 0) { 3676 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3677 KMP_ASSERT(address2os == NULL); 3678 return; 3679 } 3680 if (depth < 0) { 3681 KMP_ASSERT(msg_id != kmp_i18n_null); 3682 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3683 } 3684 } 3685 3686 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3687 3688 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3689 const char *filename; 3690 if (__kmp_cpuinfo_file != NULL) { 3691 filename = __kmp_cpuinfo_file; 3692 } 3693 else { 3694 filename = "/proc/cpuinfo"; 3695 } 3696 3697 if (__kmp_affinity_verbose) { 3698 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3699 } 3700 3701 FILE *f = fopen(filename, "r"); 3702 if (f == NULL) { 3703 int code = errno; 3704 if (__kmp_cpuinfo_file != NULL) { 3705 __kmp_msg( 3706 kmp_ms_fatal, 3707 KMP_MSG(CantOpenFileForReading, filename), 3708 KMP_ERR(code), 3709 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3710 __kmp_msg_null 3711 ); 3712 } 3713 else { 3714 __kmp_msg( 3715 kmp_ms_fatal, 3716 KMP_MSG(CantOpenFileForReading, filename), 3717 KMP_ERR(code), 3718 __kmp_msg_null 3719 ); 3720 } 3721 } 3722 int line = 0; 3723 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3724 fclose(f); 3725 if (depth < 0) { 3726 KMP_ASSERT(msg_id != kmp_i18n_null); 3727 if (line > 0) { 3728 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3729 } 3730 else { 3731 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3732 } 3733 } 3734 if (__kmp_affinity_type == affinity_none) { 3735 KMP_ASSERT(depth == 0); 3736 KMP_ASSERT(address2os == NULL); 3737 return; 3738 } 3739 } 3740 3741 # if KMP_GROUP_AFFINITY 3742 3743 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3744 if (__kmp_affinity_verbose) { 3745 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3746 } 3747 3748 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3749 KMP_ASSERT(depth != 0); 3750 if (depth < 0) { 3751 KMP_ASSERT(msg_id != kmp_i18n_null); 3752 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3753 } 3754 } 3755 3756 # endif /* KMP_GROUP_AFFINITY */ 3757 3758 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3759 if (__kmp_affinity_verbose) { 3760 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3761 } 3762 3763 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3764 if (depth == 0) { 3765 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3766 KMP_ASSERT(address2os == NULL); 3767 return; 3768 } 3769 // should not fail 3770 KMP_ASSERT(depth > 0); 3771 KMP_ASSERT(address2os != NULL); 3772 } 3773 3774 if (address2os == NULL) { 3775 if 
(KMP_AFFINITY_CAPABLE() 3776 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3777 && (__kmp_affinity_type != affinity_none)))) { 3778 KMP_WARNING(ErrorInitializeAffinity); 3779 } 3780 __kmp_affinity_type = affinity_none; 3781 KMP_AFFINITY_DISABLE(); 3782 return; 3783 } 3784 3785 __kmp_apply_thread_places(&address2os, depth); 3786 3787 // 3788 // Create the table of masks, indexed by thread Id. 3789 // 3790 unsigned maxIndex; 3791 unsigned numUnique; 3792 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3793 address2os, __kmp_avail_proc); 3794 if (__kmp_affinity_gran_levels == 0) { 3795 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3796 } 3797 3798 // 3799 // Set the childNums vector in all Address objects. This must be done 3800 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3801 // which takes into account the setting of __kmp_affinity_compact. 3802 // 3803 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3804 3805 switch (__kmp_affinity_type) { 3806 3807 case affinity_explicit: 3808 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3809 # if OMP_40_ENABLED 3810 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3811 # endif 3812 { 3813 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3814 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3815 maxIndex); 3816 } 3817 # if OMP_40_ENABLED 3818 else { 3819 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3820 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3821 maxIndex); 3822 } 3823 # endif 3824 if (__kmp_affinity_num_masks == 0) { 3825 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3826 && (__kmp_affinity_type != affinity_none))) { 3827 KMP_WARNING(AffNoValidProcID); 3828 } 3829 __kmp_affinity_type = affinity_none; 3830 return; 3831 } 3832 break; 3833 3834 // 3835 // The other affinity types rely on sorting the Addresses according 3836 // to some permutation of the machine topology tree. Set 3837 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3838 // then jump to a common code fragment to do the sort and create 3839 // the array of affinity masks. 
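//
// As an illustration (the values are hypothetical), on a depth == 3 machine
// (package / core / thread):
//
//   - affinity_scatter with a user permute setting of 0 becomes
//     __kmp_affinity_compact = depth - 1 - 0 = 2, so the sort varies the
//     package id fastest and consecutive masks land on different packages;
//   - affinity_compact with a setting of 0 leaves __kmp_affinity_compact at
//     0, so consecutive masks share a package first, then a core.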
3840 // 3841 3842 case affinity_logical: 3843 __kmp_affinity_compact = 0; 3844 if (__kmp_affinity_offset) { 3845 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3846 % __kmp_avail_proc; 3847 } 3848 goto sortAddresses; 3849 3850 case affinity_physical: 3851 if (__kmp_nThreadsPerCore > 1) { 3852 __kmp_affinity_compact = 1; 3853 if (__kmp_affinity_compact >= depth) { 3854 __kmp_affinity_compact = 0; 3855 } 3856 } else { 3857 __kmp_affinity_compact = 0; 3858 } 3859 if (__kmp_affinity_offset) { 3860 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3861 % __kmp_avail_proc; 3862 } 3863 goto sortAddresses; 3864 3865 case affinity_scatter: 3866 if (__kmp_affinity_compact >= depth) { 3867 __kmp_affinity_compact = 0; 3868 } 3869 else { 3870 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3871 } 3872 goto sortAddresses; 3873 3874 case affinity_compact: 3875 if (__kmp_affinity_compact >= depth) { 3876 __kmp_affinity_compact = depth - 1; 3877 } 3878 goto sortAddresses; 3879 3880 case affinity_balanced: 3881 // Balanced works only for the case of a single package 3882 if( nPackages > 1 ) { 3883 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3884 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3885 } 3886 __kmp_affinity_type = affinity_none; 3887 return; 3888 } else if( __kmp_affinity_uniform_topology() ) { 3889 break; 3890 } else { // Non-uniform topology 3891 3892 // Save the depth for further usage 3893 __kmp_aff_depth = depth; 3894 3895 // Number of hyper threads per core in HT machine 3896 int nth_per_core = __kmp_nThreadsPerCore; 3897 3898 int core_level; 3899 if( nth_per_core > 1 ) { 3900 core_level = depth - 2; 3901 } else { 3902 core_level = depth - 1; 3903 } 3904 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3905 int nproc = nth_per_core * ncores; 3906 3907 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3908 for( int i = 0; i < nproc; i++ ) { 3909 procarr[ i ] = -1; 3910 } 3911 3912 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3913 int proc = address2os[ i ].second; 3914 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3915 // If there is only one thread per core then depth == 2: level 0 - package, 3916 // level 1 - core. 3917 int level = depth - 1; 3918 3919 // __kmp_nth_per_core == 1 3920 int thread = 0; 3921 int core = address2os[ i ].first.labels[ level ]; 3922 // If the thread level exists, that is we have more than one thread context per core 3923 if( nth_per_core > 1 ) { 3924 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3925 core = address2os[ i ].first.labels[ level - 1 ]; 3926 } 3927 procarr[ core * nth_per_core + thread ] = proc; 3928 } 3929 3930 break; 3931 } 3932 3933 sortAddresses: 3934 // 3935 // Allocate the gtid->affinity mask table. 
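//
// One mask is created per available OS proc when __kmp_affinity_dups is
// set, otherwise one per unique address at the chosen granularity
// (numUnique).  Hypothetical example: 8 OS procs with granularity=core and
// two hardware threads per core yield 4 masks, each covering a pair of
// procs.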
3936 // 3937 if (__kmp_affinity_dups) { 3938 __kmp_affinity_num_masks = __kmp_avail_proc; 3939 } 3940 else { 3941 __kmp_affinity_num_masks = numUnique; 3942 } 3943 3944 # if OMP_40_ENABLED 3945 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3946 && ( __kmp_affinity_num_places > 0 ) 3947 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3948 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3949 } 3950 # endif 3951 3952 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3953 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3954 3955 // 3956 // Sort the address2os table according to the current setting of 3957 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3958 // 3959 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3960 __kmp_affinity_cmp_Address_child_num); 3961 { 3962 int i; 3963 unsigned j; 3964 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3965 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3966 continue; 3967 } 3968 unsigned osId = address2os[i].second; 3969 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3970 kmp_affin_mask_t *dest 3971 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3972 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3973 KMP_CPU_COPY(dest, src); 3974 if (++j >= __kmp_affinity_num_masks) { 3975 break; 3976 } 3977 } 3978 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3979 } 3980 break; 3981 3982 default: 3983 KMP_ASSERT2(0, "Unexpected affinity setting"); 3984 } 3985 3986 __kmp_free(osId2Mask); 3987 machine_hierarchy.init(address2os, __kmp_avail_proc); 3988 } 3989 3990 3991 void 3992 __kmp_affinity_initialize(void) 3993 { 3994 // 3995 // Much of the code above was written assumming that if a machine was not 3996 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3997 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3998 // 3999 // There are too many checks for __kmp_affinity_type == affinity_none 4000 // in this code. Instead of trying to change them all, check if 4001 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4002 // affinity_none, call the real initialization routine, then restore 4003 // __kmp_affinity_type to affinity_disabled. 4004 // 4005 int disabled = (__kmp_affinity_type == affinity_disabled); 4006 if (! KMP_AFFINITY_CAPABLE()) { 4007 KMP_ASSERT(disabled); 4008 } 4009 if (disabled) { 4010 __kmp_affinity_type = affinity_none; 4011 } 4012 __kmp_aux_affinity_initialize(); 4013 if (disabled) { 4014 __kmp_affinity_type = affinity_disabled; 4015 } 4016 } 4017 4018 4019 void 4020 __kmp_affinity_uninitialize(void) 4021 { 4022 if (__kmp_affinity_masks != NULL) { 4023 __kmp_free(__kmp_affinity_masks); 4024 __kmp_affinity_masks = NULL; 4025 } 4026 if (fullMask != NULL) { 4027 KMP_CPU_FREE(fullMask); 4028 fullMask = NULL; 4029 } 4030 __kmp_affinity_num_masks = 0; 4031 # if OMP_40_ENABLED 4032 __kmp_affinity_num_places = 0; 4033 # endif 4034 if (__kmp_affinity_proclist != NULL) { 4035 __kmp_free(__kmp_affinity_proclist); 4036 __kmp_affinity_proclist = NULL; 4037 } 4038 if( address2os != NULL ) { 4039 __kmp_free( address2os ); 4040 address2os = NULL; 4041 } 4042 if( procarr != NULL ) { 4043 __kmp_free( procarr ); 4044 procarr = NULL; 4045 } 4046 } 4047 4048 4049 void 4050 __kmp_affinity_set_init_mask(int gtid, int isa_root) 4051 { 4052 if (! 
KMP_AFFINITY_CAPABLE()) { 4053 return; 4054 } 4055 4056 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4057 if (th->th.th_affin_mask == NULL) { 4058 KMP_CPU_ALLOC(th->th.th_affin_mask); 4059 } 4060 else { 4061 KMP_CPU_ZERO(th->th.th_affin_mask); 4062 } 4063 4064 // 4065 // Copy the thread mask to the kmp_info_t strucuture. 4066 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one 4067 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask 4068 // is set, then the full mask is the same as the mask of the initialization 4069 // thread. 4070 // 4071 kmp_affin_mask_t *mask; 4072 int i; 4073 4074 # if OMP_40_ENABLED 4075 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4076 # endif 4077 { 4078 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced) 4079 ) { 4080 # if KMP_GROUP_AFFINITY 4081 if (__kmp_num_proc_groups > 1) { 4082 return; 4083 } 4084 # endif 4085 KMP_ASSERT(fullMask != NULL); 4086 i = KMP_PLACE_ALL; 4087 mask = fullMask; 4088 } 4089 else { 4090 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4091 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4092 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4093 } 4094 } 4095 # if OMP_40_ENABLED 4096 else { 4097 if ((! isa_root) 4098 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4099 # if KMP_GROUP_AFFINITY 4100 if (__kmp_num_proc_groups > 1) { 4101 return; 4102 } 4103 # endif 4104 KMP_ASSERT(fullMask != NULL); 4105 i = KMP_PLACE_ALL; 4106 mask = fullMask; 4107 } 4108 else { 4109 // 4110 // int i = some hash function or just a counter that doesn't 4111 // always start at 0. Use gtid for now. 4112 // 4113 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4114 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4115 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4116 } 4117 } 4118 # endif 4119 4120 # if OMP_40_ENABLED 4121 th->th.th_current_place = i; 4122 if (isa_root) { 4123 th->th.th_new_place = i; 4124 th->th.th_first_place = 0; 4125 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4126 } 4127 4128 if (i == KMP_PLACE_ALL) { 4129 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4130 gtid)); 4131 } 4132 else { 4133 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4134 gtid, i)); 4135 } 4136 # else 4137 if (i == -1) { 4138 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n", 4139 gtid)); 4140 } 4141 else { 4142 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4143 gtid, i)); 4144 } 4145 # endif /* OMP_40_ENABLED */ 4146 4147 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4148 4149 if (__kmp_affinity_verbose) { 4150 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4151 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4152 th->th.th_affin_mask); 4153 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid, 4154 buf); 4155 } 4156 4157 # if KMP_OS_WINDOWS 4158 // 4159 // On Windows* OS, the process affinity mask might have changed. 4160 // If the user didn't request affinity and this call fails, 4161 // just continue silently. See CQ171393. 4162 // 4163 if ( __kmp_affinity_type == affinity_none ) { 4164 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4165 } 4166 else 4167 # endif 4168 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4169 } 4170 4171 4172 # if OMP_40_ENABLED 4173 4174 void 4175 __kmp_affinity_set_place(int gtid) 4176 { 4177 int retval; 4178 4179 if (! 
KMP_AFFINITY_CAPABLE()) {
4180 return;
4181 }
4182
4183 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4184
4185 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
4186 gtid, th->th.th_new_place, th->th.th_current_place));
4187
4188 //
4189 // Check that the new place is within this thread's partition.
4190 //
4191 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4192 KMP_ASSERT(th->th.th_new_place >= 0);
4193 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4194 if (th->th.th_first_place <= th->th.th_last_place) {
4195 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
4196 && (th->th.th_new_place <= th->th.th_last_place));
4197 }
4198 else {
4199 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
4200 || (th->th.th_new_place >= th->th.th_last_place));
4201 }
4202
4203 //
4204 // Copy the thread mask to the kmp_info_t structure,
4205 // and set this thread's affinity.
4206 //
4207 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
4208 th->th.th_new_place);
4209 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4210 th->th.th_current_place = th->th.th_new_place;
4211
4212 if (__kmp_affinity_verbose) {
4213 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4214 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4215 th->th.th_affin_mask);
4216 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4217 gtid, buf);
4218 }
4219 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4220 }
4221
4222 # endif /* OMP_40_ENABLED */
4223
4224
4225 int
4226 __kmp_aux_set_affinity(void **mask)
4227 {
4228 int gtid;
4229 kmp_info_t *th;
4230 int retval;
4231
4232 if (! KMP_AFFINITY_CAPABLE()) {
4233 return -1;
4234 }
4235
4236 gtid = __kmp_entry_gtid();
4237 KA_TRACE(1000, ;{
4238 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4239 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4240 (kmp_affin_mask_t *)(*mask));
4241 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
4242 gtid, buf);
4243 });
4244
4245 if (__kmp_env_consistency_check) {
4246 if ((mask == NULL) || (*mask == NULL)) {
4247 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4248 }
4249 else {
4250 unsigned proc;
4251 int num_procs = 0;
4252
4253 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
4254 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4255 continue;
4256 }
4257 num_procs++;
4258 if (! KMP_CPU_ISSET(proc, fullMask)) {
4259 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4260 break;
4261 }
4262 }
4263 if (num_procs == 0) {
4264 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4265 }
4266
4267 # if KMP_GROUP_AFFINITY
4268 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4269 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4270 }
4271 # endif /* KMP_GROUP_AFFINITY */
4272
4273 }
4274 }
4275
4276 th = __kmp_threads[gtid];
4277 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4278 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4279 if (retval == 0) {
4280 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4281 }
4282
4283 # if OMP_40_ENABLED
4284 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4285 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4286 th->th.th_first_place = 0;
4287 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4288
4289 //
4290 // Turn off 4.0 affinity for the current thread at this parallel level.
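// Once a thread has installed its own mask through this entry point, the
// OMP 4.0 place mechanism no longer controls it, so proc_bind is forced to
// proc_bind_false below.
//
// Illustrative caller-side sketch (user code, not part of this file),
// using the public kmp_* affinity API that forwards to these
// __kmp_aux_* routines:
//
//     kmp_affinity_mask_t m;
//     kmp_create_affinity_mask(&m);
//     kmp_set_affinity_mask_proc(3, &m);   // request OS proc 3
//     if (kmp_set_affinity(&m) != 0) {
//         /* mask rejected, or affinity not capable */
//     }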
4291 // 4292 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4293 # endif 4294 4295 return retval; 4296 } 4297 4298 4299 int 4300 __kmp_aux_get_affinity(void **mask) 4301 { 4302 int gtid; 4303 int retval; 4304 kmp_info_t *th; 4305 4306 if (! KMP_AFFINITY_CAPABLE()) { 4307 return -1; 4308 } 4309 4310 gtid = __kmp_entry_gtid(); 4311 th = __kmp_threads[gtid]; 4312 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4313 4314 KA_TRACE(1000, ;{ 4315 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4316 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4317 th->th.th_affin_mask); 4318 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf); 4319 }); 4320 4321 if (__kmp_env_consistency_check) { 4322 if ((mask == NULL) || (*mask == NULL)) { 4323 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4324 } 4325 } 4326 4327 # if !KMP_OS_WINDOWS 4328 4329 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4330 KA_TRACE(1000, ;{ 4331 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4332 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4333 (kmp_affin_mask_t *)(*mask)); 4334 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf); 4335 }); 4336 return retval; 4337 4338 # else 4339 4340 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4341 return 0; 4342 4343 # endif /* KMP_OS_WINDOWS */ 4344 4345 } 4346 4347 int 4348 __kmp_aux_set_affinity_mask_proc(int proc, void **mask) 4349 { 4350 int retval; 4351 4352 if (! KMP_AFFINITY_CAPABLE()) { 4353 return -1; 4354 } 4355 4356 KA_TRACE(1000, ;{ 4357 int gtid = __kmp_entry_gtid(); 4358 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4359 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4360 (kmp_affin_mask_t *)(*mask)); 4361 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4362 proc, gtid, buf); 4363 }); 4364 4365 if (__kmp_env_consistency_check) { 4366 if ((mask == NULL) || (*mask == NULL)) { 4367 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4368 } 4369 } 4370 4371 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4372 return -1; 4373 } 4374 if (! KMP_CPU_ISSET(proc, fullMask)) { 4375 return -2; 4376 } 4377 4378 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4379 return 0; 4380 } 4381 4382 4383 int 4384 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4385 { 4386 int retval; 4387 4388 if (! KMP_AFFINITY_CAPABLE()) { 4389 return -1; 4390 } 4391 4392 KA_TRACE(1000, ;{ 4393 int gtid = __kmp_entry_gtid(); 4394 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4395 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4396 (kmp_affin_mask_t *)(*mask)); 4397 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4398 proc, gtid, buf); 4399 }); 4400 4401 if (__kmp_env_consistency_check) { 4402 if ((mask == NULL) || (*mask == NULL)) { 4403 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4404 } 4405 } 4406 4407 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4408 return -1; 4409 } 4410 if (! KMP_CPU_ISSET(proc, fullMask)) { 4411 return -2; 4412 } 4413 4414 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4415 return 0; 4416 } 4417 4418 4419 int 4420 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4421 { 4422 int retval; 4423 4424 if (! 
KMP_AFFINITY_CAPABLE()) { 4425 return -1; 4426 } 4427 4428 KA_TRACE(1000, ;{ 4429 int gtid = __kmp_entry_gtid(); 4430 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4431 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4432 (kmp_affin_mask_t *)(*mask)); 4433 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n", 4434 proc, gtid, buf); 4435 }); 4436 4437 if (__kmp_env_consistency_check) { 4438 if ((mask == NULL) || (*mask == NULL)) { 4439 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4440 } 4441 } 4442 4443 if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) { 4444 return 0; 4445 } 4446 if (! KMP_CPU_ISSET(proc, fullMask)) { 4447 return 0; 4448 } 4449 4450 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4451 } 4452 4453 4454 // Dynamic affinity settings - Affinity balanced 4455 void __kmp_balanced_affinity( int tid, int nthreads ) 4456 { 4457 if( __kmp_affinity_uniform_topology() ) { 4458 int coreID; 4459 int threadID; 4460 // Number of hyper threads per core in HT machine 4461 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4462 // Number of cores 4463 int ncores = __kmp_ncores; 4464 // How many threads will be bound to each core 4465 int chunk = nthreads / ncores; 4466 // How many cores will have an additional thread bound to it - "big cores" 4467 int big_cores = nthreads % ncores; 4468 // Number of threads on the big cores 4469 int big_nth = ( chunk + 1 ) * big_cores; 4470 if( tid < big_nth ) { 4471 coreID = tid / (chunk + 1 ); 4472 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ; 4473 } else { //tid >= big_nth 4474 coreID = ( tid - big_cores ) / chunk; 4475 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ; 4476 } 4477 4478 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4479 "Illegal set affinity operation when not capable"); 4480 4481 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 4482 KMP_CPU_ZERO(mask); 4483 4484 // Granularity == thread 4485 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4486 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; 4487 KMP_CPU_SET( osID, mask); 4488 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4489 for( int i = 0; i < __kmp_nth_per_core; i++ ) { 4490 int osID; 4491 osID = address2os[ coreID * __kmp_nth_per_core + i ].second; 4492 KMP_CPU_SET( osID, mask); 4493 } 4494 } 4495 if (__kmp_affinity_verbose) { 4496 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4497 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4498 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4499 tid, buf); 4500 } 4501 __kmp_set_system_affinity( mask, TRUE ); 4502 } else { // Non-uniform topology 4503 4504 kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 4505 KMP_CPU_ZERO(mask); 4506 4507 // Number of hyper threads per core in HT machine 4508 int nth_per_core = __kmp_nThreadsPerCore; 4509 int core_level; 4510 if( nth_per_core > 1 ) { 4511 core_level = __kmp_aff_depth - 2; 4512 } else { 4513 core_level = __kmp_aff_depth - 1; 4514 } 4515 4516 // Number of cores - maximum value; it does not count trail cores with 0 processors 4517 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 4518 4519 // For performance gain consider the special case nthreads == __kmp_avail_proc 4520 if( nthreads == __kmp_avail_proc ) { 4521 if( __kmp_affinity_gran == affinity_gran_fine || 
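// (Worked example for the uniform-topology branch above; the numbers are
// hypothetical.)  With nthreads == 6, ncores == 4 and 2 hardware threads
// per core: chunk == 1, big_cores == 2, big_nth == 4, so tids 0,1 share
// core 0, tids 2,3 share core 1, and tids 4,5 get cores 2 and 3 to
// themselves; the "big" cores absorb the remainder.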
__kmp_affinity_gran == affinity_gran_thread) {
4522 int osID = address2os[ tid ].second;
4523 KMP_CPU_SET( osID, mask);
4524 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
4525 int coreID = address2os[ tid ].first.labels[ core_level ];
4526 // We'll count the osIDs found for the current core; there can be at most nth_per_core of them;
4527 // since address2os is sorted, we can break when cnt == nth_per_core
4528 int cnt = 0;
4529 for( int i = 0; i < __kmp_avail_proc; i++ ) {
4530 int osID = address2os[ i ].second;
4531 int core = address2os[ i ].first.labels[ core_level ];
4532 if( core == coreID ) {
4533 KMP_CPU_SET( osID, mask);
4534 cnt++;
4535 if( cnt == nth_per_core ) {
4536 break;
4537 }
4538 }
4539 }
4540 }
4541 } else if( nthreads <= __kmp_ncores ) {
4542
4543 int core = 0;
4544 for( int i = 0; i < ncores; i++ ) {
4545 // Check if this core from procarr[] is in the mask
4546 int in_mask = 0;
4547 for( int j = 0; j < nth_per_core; j++ ) {
4548 if( procarr[ i * nth_per_core + j ] != - 1 ) {
4549 in_mask = 1;
4550 break;
4551 }
4552 }
4553 if( in_mask ) {
4554 if( tid == core ) {
4555 for( int j = 0; j < nth_per_core; j++ ) {
4556 int osID = procarr[ i * nth_per_core + j ];
4557 if( osID != -1 ) {
4558 KMP_CPU_SET( osID, mask );
4559 // For granularity=thread it is enough to set the first available osID for this core
4560 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
4561 break;
4562 }
4563 }
4564 }
4565 break;
4566 } else {
4567 core++;
4568 }
4569 }
4570 }
4571
4572 } else { // nthreads > __kmp_ncores
4573
4574 // Array to save the number of processors at each core
4575 int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
4576 // Array to save the number of cores with "x" available processors;
4577 int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4578 // Array to save the number of cores with # procs from x to nth_per_core
4579 int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
4580
4581 for( int i = 0; i <= nth_per_core; i++ ) {
4582 ncores_with_x_procs[ i ] = 0;
4583 ncores_with_x_to_max_procs[ i ] = 0;
4584 }
4585
4586 for( int i = 0; i < ncores; i++ ) {
4587 int cnt = 0;
4588 for( int j = 0; j < nth_per_core; j++ ) {
4589 if( procarr[ i * nth_per_core + j ] != -1 ) {
4590 cnt++;
4591 }
4592 }
4593 nproc_at_core[ i ] = cnt;
4594 ncores_with_x_procs[ cnt ]++;
4595 }
4596
4597 for( int i = 0; i <= nth_per_core; i++ ) {
4598 for( int j = i; j <= nth_per_core; j++ ) {
4599 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
4600 }
4601 }
4602
4603 // Max number of processors
4604 int nproc = nth_per_core * ncores;
4605 // An array to keep number of threads per each context
4606 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
4607 for( int i = 0; i < nproc; i++ ) {
4608 newarr[ i ] = 0;
4609 }
4610
4611 int nth = nthreads;
4612 int flag = 0;
4613 while( nth > 0 ) {
4614 for( int j = 1; j <= nth_per_core; j++ ) {
4615 int cnt = ncores_with_x_to_max_procs[ j ];
4616 for( int i = 0; i < ncores; i++ ) {
4617 // Skip the core with 0 processors
4618 if( nproc_at_core[ i ] == 0 ) {
4619 continue;
4620 }
4621 for( int k = 0; k < nth_per_core; k++ ) {
4622 if( procarr[ i * nth_per_core + k ] != -1 ) {
4623 if( newarr[ i * nth_per_core + k ] == 0 ) {
4624 newarr[ i * nth_per_core + k ] = 1;
4625 cnt--;
4626 nth--;
4627 break;
4628 } else {
4629 if( flag != 0 ) {
4630 newarr[ i * nth_per_core + k ] ++;
4631 cnt--;
4632 nth--;
4633
break; 4634 } 4635 } 4636 } 4637 } 4638 if( cnt == 0 || nth == 0 ) { 4639 break; 4640 } 4641 } 4642 if( nth == 0 ) { 4643 break; 4644 } 4645 } 4646 flag = 1; 4647 } 4648 int sum = 0; 4649 for( int i = 0; i < nproc; i++ ) { 4650 sum += newarr[ i ]; 4651 if( sum > tid ) { 4652 // Granularity == thread 4653 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4654 int osID = procarr[ i ]; 4655 KMP_CPU_SET( osID, mask); 4656 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4657 int coreID = i / nth_per_core; 4658 for( int ii = 0; ii < nth_per_core; ii++ ) { 4659 int osID = procarr[ coreID * nth_per_core + ii ]; 4660 if( osID != -1 ) { 4661 KMP_CPU_SET( osID, mask); 4662 } 4663 } 4664 } 4665 break; 4666 } 4667 } 4668 __kmp_free( newarr ); 4669 } 4670 4671 if (__kmp_affinity_verbose) { 4672 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4673 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4674 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4675 tid, buf); 4676 } 4677 __kmp_set_system_affinity( mask, TRUE ); 4678 } 4679 } 4680 4681 #else 4682 // affinity not supported 4683 4684 static const kmp_uint32 noaff_maxLevels=7; 4685 kmp_uint32 noaff_skipPerLevel[noaff_maxLevels]; 4686 kmp_uint32 noaff_depth; 4687 kmp_uint8 noaff_leaf_kids; 4688 kmp_int8 noaff_uninitialized=1; 4689 4690 void noaff_init(int nprocs) 4691 { 4692 kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2); 4693 if (result == 0) return; // Already initialized 4694 else if (result == 2) { // Someone else is initializing 4695 while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE(); 4696 return; 4697 } 4698 KMP_DEBUG_ASSERT(result==1); 4699 4700 kmp_uint32 numPerLevel[noaff_maxLevels]; 4701 noaff_depth = 1; 4702 for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level 4703 numPerLevel[i] = 1; 4704 noaff_skipPerLevel[i] = 1; 4705 } 4706 4707 numPerLevel[0] = 4; 4708 numPerLevel[1] = nprocs/4; 4709 if (nprocs%4) numPerLevel[1]++; 4710 4711 for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth 4712 if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1' 4713 noaff_depth++; 4714 4715 kmp_uint32 branch = 4; 4716 if (numPerLevel[0] == 1) branch = nprocs/4; 4717 if (branch<4) branch=4; 4718 for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width 4719 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0! 
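// Each pass narrows an over-wide level and pushes the excess upward: level
// d is halved (rounding up via the increment below) while level d+1 is
// doubled, and the depth grows the first time level d+1 stops being a
// placeholder '1'.  Hypothetical example with branch == 4: numPerLevel[d]
// goes 16 -> 8 -> 4 while numPerLevel[d+1] goes 1 -> 2 -> 4.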
4720 if (numPerLevel[d] & 1) numPerLevel[d]++; 4721 numPerLevel[d] = numPerLevel[d] >> 1; 4722 if (numPerLevel[d+1] == 1) noaff_depth++; 4723 numPerLevel[d+1] = numPerLevel[d+1] << 1; 4724 } 4725 if(numPerLevel[0] == 1) { 4726 branch = branch >> 1; 4727 if (branch<4) branch = 4; 4728 } 4729 } 4730 4731 for (kmp_uint32 i=1; i<noaff_depth; ++i) 4732 noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1]; 4733 // Fill in hierarchy in the case of oversubscription 4734 for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i) 4735 noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1]; 4736 noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1; 4737 noaff_uninitialized = 0; // One writer 4738 4739 } 4740 4741 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 4742 if (noaff_uninitialized) 4743 noaff_init(nproc); 4744 4745 thr_bar->depth = noaff_depth; 4746 thr_bar->base_leaf_kids = noaff_leaf_kids; 4747 thr_bar->skip_per_level = noaff_skipPerLevel; 4748 } 4749 4750 #endif // KMP_AFFINITY_SUPPORTED 4751
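// (Illustrative note on noaff_init() above; the proc count is
// hypothetical.)  For nprocs == 8 it computes numPerLevel == {4, 2, 1, ...},
// so noaff_skipPerLevel becomes {1, 4, 8, 16, ...}: entry i is the number
// of threads spanned by a subtree rooted at level i, with the entries past
// noaff_depth doubling to absorb oversubscription, and noaff_leaf_kids ==
// numPerLevel[0] - 1 == 3.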