/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_affinity.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() {
    machine_hierarchy.fini();
}

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow. A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
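        //
        // (For illustration: a mask containing procs 0-3 prints as
        // "{0,1,2,3}", an empty mask prints as "{<empty>}", and a set too
        // large for the buffer is truncated to the form "{0,1,2,...}".)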
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end - scan + 1, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, end - scan + 1, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
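//
// (For example, the flat map below returns 1, since it models only the
// package level; the Windows* OS processor group map returns 2; and the
// cpuid-based maps return whatever depth they detect. These figures are
// taken from the return statements of the routines in this file.)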
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread. They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
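    //
    // (Each proc gets a two-level address below: labels[0] is its processor
    // group and labels[1] is its position within the group. With a 64-bit
    // DWORD_PTR, CHAR_BIT * sizeof(DWORD_PTR) works out to 64 procs per
    // group; that figure is illustrative, the code uses the expression.)
    //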
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while ((1 << r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    // ""
    unsigned maxThreadsPerPkg;  // ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            // ""
    unsigned threadId;          // ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
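    // (cpuid(0) reports the highest supported standard leaf in eax, so the
    // "buf.eax < 4" test below is how leaf 4 support is detected; the same
    // check is repeated later before each cpuid(4) query.)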
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off. We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
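        //
        // (Illustrative arithmetic with hypothetical numbers: if __kmp_xproc
        // were 8 and cpuid(4) reported nCoresPerPkg = 4, the lines below
        // would yield nPackages = (8 + 4 - 1) / 4 = 2 and __kmp_ncores = 8.)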
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id. It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact. In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip). On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of this field will be
    //    1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //    is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id. The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
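        //
        // (Worked example with hypothetical values: if cpuid(1) reported
        // maxThreadsPerPkg = 4 and cpuid(4) reported maxCoresPerPkg = 2, then
        // widthCT = 2, widthC = 1 and widthT = 1, so an apicId of 0xb (binary
        // 1011) would decode below as pkgId = 2, coreId = 1, threadId = 1.)
        //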
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields. pkgId's may be sparsely
    // assigned among the chips on a system. Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now. We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1; // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks. Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
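        //
        // (Illustrative example: on a package/core/thread map with
        // granularity=core, only the thread level is finer than the
        // requested granularity, so the checks below leave
        // __kmp_affinity_gran_levels equal to 1.)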
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology. While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
    // try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest. The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
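    //
    // (In the loop below, totals[level] counts how many entities have been
    // seen at that level machine-wide, counts[level] counts them within the
    // current parent, and maxCt[level] keeps the largest per-parent count
    // observed so far. For example, on a hypothetical 2-package x 4-core
    // machine, maxCt at the core level ends up as 4.)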
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology
                    // information in case the max value for some level
                    // (maxCt[level]) is encountered earlier than some smaller
                    // value while going through the array. For example, if
                    // pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1]
                    // would be 2, whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
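    //
    // (The topology counts as uniform when every level is "full", i.e. the
    // product of the per-level maxima equals the number of leaves actually
    // observed. For example, a hypothetical 2 packages x 4 cores x 2 threads
    // machine gives prod = 2 * 4 * 2 = 16, matching totals at the innermost
    // level.)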
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        int newPkgLevel = -1;
        int newCoreLevel = -1;
        int newThreadLevel = -1;
        int i;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1)
              && (level != pkgLevel)) {
                //
                // Remove this level. Never remove the package level
                //
                continue;
            }
            if (level == pkgLevel) {
                newPkgLevel = level;
            }
            if (level == coreLevel) {
                newCoreLevel = level;
            }
            if (level == threadLevel) {
                newThreadLevel = level;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
        pkgLevel = newPkgLevel;
        coreLevel = newCoreLevel;
        threadLevel = newThreadLevel;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;


static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
};


static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    unsigned i;
    const unsigned *aa = *((const unsigned **)a);
    const unsigned *bb = *((const unsigned **)b);
    for (i = maxIndex; ; i--) {
        if (aa[i] < bb[i]) return -1;
        if (aa[i] > bb[i]) return 1;
        if (i == osIdIndex) break;
    }
    return 0;
}


//
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
//
static int
__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
  kmp_i18n_id_t *const msg_id, FILE *f)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Scan the file once, counting the number of "processor" (osId) fields,
    // and finding the highest value of <n> for a node_<n> field.
    //
    char buf[256];
    unsigned num_records = 0;
    while (! feof(f)) {
        buf[sizeof(buf) - 1] = 1;
        if (! fgets(buf, sizeof(buf), f)) {
            //
            // Read errors presumably because of EOF
            //
            break;
        }

        char s1[] = "processor";
        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
            num_records++;
            continue;
        }

        //
        // FIXME - this will match "node_<n> <garbage>"
        //
        unsigned level;
        if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
            if (nodeIdIndex + level >= maxIndex) {
                maxIndex = nodeIdIndex + level;
            }
            continue;
        }
    }

    //
    // Check for empty file / no valid processor records, or too many.
    // The number of records can't exceed the number of valid bits in the
    // affinity mask.
    //
    if (num_records == 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_NoProcRecords;
        return -1;
    }
    if (num_records > (unsigned)__kmp_xproc) {
        *line = 0;
        *msg_id = kmp_i18n_str_TooManyProcRecords;
        return -1;
    }

    //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
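    //
    // (For reference, the fields recognized by the parser below are
    // "processor", "physical id", "core id", "thread id" and "node_<n> id",
    // each in the usual /proc/cpuinfo "name : value" form, e.g.
    // "processor : 12". The example value is illustrative.)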
    // Adding an extra element at the end allows us to remove a lot of extra
    // checks for termination conditions.
    //
    if (fseek(f, 0, SEEK_SET) != 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_CantRewindCpuinfo;
        return -1;
    }

    //
    // Allocate the array of records to store the proc info in. The dummy
    // element at the end makes the logic in filling them out easier to code.
    //
    unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
      * sizeof(unsigned *));
    unsigned i;
    for (i = 0; i <= num_records; i++) {
        threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
          * sizeof(unsigned));
    }

#define CLEANUP_THREAD_INFO \
    for (i = 0; i <= num_records; i++) { \
        __kmp_free(threadInfo[i]);       \
    }                                    \
    __kmp_free(threadInfo);

    //
    // A value of UINT_MAX means that we didn't find the field
    //
    unsigned __index;

#define INIT_PROC_INFO(p) \
    for (__index = 0; __index <= maxIndex; __index++) { \
        (p)[__index] = UINT_MAX;                        \
    }

    for (i = 0; i <= num_records; i++) {
        INIT_PROC_INFO(threadInfo[i]);
    }

    unsigned num_avail = 0;
    *line = 0;
    while (! feof(f)) {
        //
        // Create an inner scoping level, so that all the goto targets at the
        // end of the loop appear in an outer scoping level. This avoids
        // warnings about jumping past an initialization to a target in the
        // same block.
        //
        {
            buf[sizeof(buf) - 1] = 1;
            bool long_line = false;
            if (! fgets(buf, sizeof(buf), f)) {
                //
                // Read errors presumably because of EOF
                //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
                //
                bool valid = false;
                for (i = 0; i <= maxIndex; i++) {
                    if (threadInfo[num_avail][i] != UINT_MAX) {
                        valid = true;
                    }
                }
                if (! valid) {
                    break;
                }
                buf[0] = 0;
            } else if (!buf[sizeof(buf) - 1]) {
                //
                // The line is longer than the buffer. Set a flag and don't
                // emit an error if we were going to ignore the line, anyway.
                //
                long_line = true;

#define CHECK_LINE \
    if (long_line) {                            \
        CLEANUP_THREAD_INFO;                    \
        *msg_id = kmp_i18n_str_LongLineCpuinfo; \
        return -1;                              \
    }
            }
            (*line)++;

            char s1[] = "processor";
            if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s1) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][osIdIndex] = val;
#if KMP_OS_LINUX && USE_SYSFS_INFO
                char path[256];
                KMP_SNPRINTF(path, sizeof(path),
                  "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
                  threadInfo[num_avail][osIdIndex]);
                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);

                KMP_SNPRINTF(path, sizeof(path),
                  "/sys/devices/system/cpu/cpu%u/topology/core_id",
                  threadInfo[num_avail][osIdIndex]);
                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
                continue;
#else
            }
            char s2[] = "physical id";
            if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s2) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][pkgIdIndex] = val;
                continue;
            }
            char s3[] = "core id";
            if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s3) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][coreIdIndex] = val;
                continue;
#endif // KMP_OS_LINUX && USE_SYSFS_INFO
            }
            char s4[] = "thread id";
            if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s4) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][threadIdIndex] = val;
                continue;
            }
            unsigned level;
            if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s4) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                KMP_ASSERT(nodeIdIndex + level <= maxIndex);
                if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][nodeIdIndex + level] = val;
                continue;
            }

            //
            // We didn't recognize the leading token on the line.
            // There are lots of leading tokens that we don't recognize -
            // if the line isn't empty, go on to the next line.
            //
            if ((*buf != 0) && (*buf != '\n')) {
                //
                // If the line is longer than the buffer, read characters
                // until we find a newline.
                //
                if (long_line) {
                    int ch;
                    while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
                }
                continue;
            }

            //
            // A newline has signalled the end of the processor record.
            // Check that there aren't too many procs specified.
            //
            if ((int)num_avail == __kmp_xproc) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_TooManyEntries;
                return -1;
            }

            //
            // Check for missing fields. The osId field must be there, and we
            // currently require that the physical id field is specified, also.
            //
            if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_MissingProcField;
                return -1;
            }
            if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_MissingPhysicalIDField;
                return -1;
            }

            //
            // Skip this proc if it is not included in the machine model.
            //
            if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
                INIT_PROC_INFO(threadInfo[num_avail]);
                continue;
            }

            //
            // We have a successful parse of this proc's info.
            // Increment the counter, and prepare for the next proc.
            //
            num_avail++;
            KMP_ASSERT(num_avail <= num_records);
            INIT_PROC_INFO(threadInfo[num_avail]);
        }
        continue;

        no_val:
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingValCpuinfo;
        return -1;

        dup_field:
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
        return -1;
    }
    *line = 0;

# if KMP_MIC && REDUCE_TEAM_SIZE
    unsigned teamSize = 0;
# endif // KMP_MIC && REDUCE_TEAM_SIZE

    // check for num_records == __kmp_xproc ???

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(num_avail > 0);
    KMP_ASSERT(num_avail <= num_records);
    if (num_avail == 1) {
        __kmp_ncores = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
        if (__kmp_affinity_verbose) {
            if (! KMP_AFFINITY_CAPABLE()) {
KMP_AFFINITY_CAPABLE()) { 1850 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 1851 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1852 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1853 } 1854 else { 1855 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1856 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 1857 fullMask); 1858 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 1859 if (__kmp_affinity_respect_mask) { 1860 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1861 } else { 1862 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1863 } 1864 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1865 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1866 } 1867 int index; 1868 kmp_str_buf_t buf; 1869 __kmp_str_buf_init(&buf); 1870 __kmp_str_buf_print(&buf, "1"); 1871 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 1872 __kmp_str_buf_print(&buf, " x 1"); 1873 } 1874 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 1875 __kmp_str_buf_free(&buf); 1876 } 1877 1878 if (__kmp_affinity_type == affinity_none) { 1879 CLEANUP_THREAD_INFO; 1880 return 0; 1881 } 1882 1883 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1884 Address addr(1); 1885 addr.labels[0] = threadInfo[0][pkgIdIndex]; 1886 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 1887 1888 if (__kmp_affinity_gran_levels < 0) { 1889 __kmp_affinity_gran_levels = 0; 1890 } 1891 1892 if (__kmp_affinity_verbose) { 1893 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1894 } 1895 1896 CLEANUP_THREAD_INFO; 1897 return 1; 1898 } 1899 1900 // 1901 // Sort the threadInfo table by physical Id. 1902 // 1903 qsort(threadInfo, num_avail, sizeof(*threadInfo), 1904 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 1905 1906 // 1907 // The table is now sorted by pkgId / coreId / threadId, but we really 1908 // don't know the radix of any of the fields. pkgId's may be sparsely 1909 // assigned among the chips on a system. Although coreId's are usually 1910 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1911 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1912 // 1913 // For that matter, we don't know what coresPerPkg and threadsPerCore 1914 // (or the total # packages) are at this point - we want to determine 1915 // that now. We only have an upper bound on the first two figures. 1916 // 1917 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 1918 * sizeof(unsigned)); 1919 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 1920 * sizeof(unsigned)); 1921 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 1922 * sizeof(unsigned)); 1923 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 1924 * sizeof(unsigned)); 1925 1926 bool assign_thread_ids = false; 1927 unsigned threadIdCt; 1928 unsigned index; 1929 1930 restart_radix_check: 1931 threadIdCt = 0; 1932 1933 // 1934 // Initialize the counter arrays with data from threadInfo[0]. 1935 // 1936 if (assign_thread_ids) { 1937 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 1938 threadInfo[0][threadIdIndex] = threadIdCt++; 1939 } 1940 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 1941 threadIdCt = threadInfo[0][threadIdIndex] + 1; 1942 } 1943 } 1944 for (index = 0; index <= maxIndex; index++) { 1945 counts[index] = 1; 1946 maxCt[index] = 1; 1947 totals[index] = 1; 1948 lastId[index] = threadInfo[0][index];; 1949 } 1950 1951 // 1952 // Run through the rest of the OS procs. 
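//
// As a purely illustrative walk-through (hypothetical data, not taken from
// any real /proc/cpuinfo): suppose the sorted table describes 1 package
// with 2 cores and 2 thread contexts per core, i.e. the (pkgId, coreId,
// threadId) triples (0,0,0), (0,0,1), (0,1,0), (0,1,1).  After the
// initialization from record 0 above, the remaining records update the
// arrays as follows:
//   (0,0,1): thread id differs -> counts[thread]=2, totals[thread]=2
//   (0,1,0): core id differs   -> totals[thread]=3, maxCt[thread]=2,
//            counts[thread]=1, counts[core]=2, totals[core]=2
//   (0,1,1): thread id differs -> counts[thread]=2, totals[thread]=4
// so the code further down ends up with nPackages = totals[pkg] = 1,
// __kmp_ncores = totals[core] = 2, nCoresPerPkg = maxCt[core] = 2 and
// __kmp_nThreadsPerCore = maxCt[thread] = 2.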
1953 //
1954 for (i = 1; i < num_avail; i++) {
1955 //
1956 // Find the most significant index whose id differs
1957 // from the id for the previous OS proc.
1958 //
1959 for (index = maxIndex; index >= threadIdIndex; index--) {
1960 if (assign_thread_ids && (index == threadIdIndex)) {
1961 //
1962 // Auto-assign the thread id field if it wasn't specified.
1963 //
1964 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
1965 threadInfo[i][threadIdIndex] = threadIdCt++;
1966 }
1967
1968 //
1969 // Apparently the thread id field was specified for some
1970 // entries and not others. Start the thread id counter
1971 // off at the next higher thread id.
1972 //
1973 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
1974 threadIdCt = threadInfo[i][threadIdIndex] + 1;
1975 }
1976 }
1977 if (threadInfo[i][index] != lastId[index]) {
1978 //
1979 // Run through all indices which are less significant,
1980 // and reset the counts to 1.
1981 //
1982 // At all levels up to and including index, we need to
1983 // increment the totals and record the last id.
1984 //
1985 unsigned index2;
1986 for (index2 = threadIdIndex; index2 < index; index2++) {
1987 totals[index2]++;
1988 if (counts[index2] > maxCt[index2]) {
1989 maxCt[index2] = counts[index2];
1990 }
1991 counts[index2] = 1;
1992 lastId[index2] = threadInfo[i][index2];
1993 }
1994 counts[index]++;
1995 totals[index]++;
1996 lastId[index] = threadInfo[i][index];
1997
1998 if (assign_thread_ids && (index > threadIdIndex)) {
1999
2000 # if KMP_MIC && REDUCE_TEAM_SIZE
2001 //
2002 // The default team size is the total #threads in the machine
2003 // minus 1 thread for every core that has 3 or more threads.
2004 //
2005 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2006 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2007
2008 //
2009 // Restart the thread counter, as we are on a new core.
2010 //
2011 threadIdCt = 0;
2012
2013 //
2014 // Auto-assign the thread id field if it wasn't specified.
2015 //
2016 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2017 threadInfo[i][threadIdIndex] = threadIdCt++;
2018 }
2019
2020 //
2021 // Apparently the thread id field was specified for some
2022 // entries and not others. Start the thread id counter
2023 // off at the next higher thread id.
2024 //
2025 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2026 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2027 }
2028 }
2029 break;
2030 }
2031 }
2032 if (index < threadIdIndex) {
2033 //
2034 // If thread ids were specified, it is an error if they are not
2035 // unique. Also, check that we haven't already restarted the
2036 // loop (to be safe - shouldn't need to).
2037 //
2038 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2039 || assign_thread_ids) {
2040 __kmp_free(lastId);
2041 __kmp_free(totals);
2042 __kmp_free(maxCt);
2043 __kmp_free(counts);
2044 CLEANUP_THREAD_INFO;
2045 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2046 return -1;
2047 }
2048
2049 //
2050 // If the thread ids were not specified and we see entries
2051 // that are duplicates, start the loop over and
2052 // assign the thread ids manually.
2053 //
2054 assign_thread_ids = true;
2055 goto restart_radix_check;
2056 }
2057 }
2058
2059 # if KMP_MIC && REDUCE_TEAM_SIZE
2060 //
2061 // The default team size is the total #threads in the machine
2062 // minus 1 thread for every core that has 3 or more threads.
2063 //
2064 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2065 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2066 2067 for (index = threadIdIndex; index <= maxIndex; index++) { 2068 if (counts[index] > maxCt[index]) { 2069 maxCt[index] = counts[index]; 2070 } 2071 } 2072 2073 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2074 nCoresPerPkg = maxCt[coreIdIndex]; 2075 nPackages = totals[pkgIdIndex]; 2076 2077 // 2078 // Check to see if the machine topology is uniform 2079 // 2080 unsigned prod = totals[maxIndex]; 2081 for (index = threadIdIndex; index < maxIndex; index++) { 2082 prod *= maxCt[index]; 2083 } 2084 bool uniform = (prod == totals[threadIdIndex]); 2085 2086 // 2087 // When affinity is off, this routine will still be called to set 2088 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2089 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2090 // correctly, and return now if affinity is not enabled. 2091 // 2092 __kmp_ncores = totals[coreIdIndex]; 2093 2094 if (__kmp_affinity_verbose) { 2095 if (! KMP_AFFINITY_CAPABLE()) { 2096 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2097 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2098 if (uniform) { 2099 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2100 } else { 2101 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2102 } 2103 } 2104 else { 2105 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2106 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2107 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2108 if (__kmp_affinity_respect_mask) { 2109 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2110 } else { 2111 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2112 } 2113 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2114 if (uniform) { 2115 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2116 } else { 2117 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2118 } 2119 } 2120 kmp_str_buf_t buf; 2121 __kmp_str_buf_init(&buf); 2122 2123 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2124 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2125 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2126 } 2127 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2128 maxCt[threadIdIndex], __kmp_ncores); 2129 2130 __kmp_str_buf_free(&buf); 2131 } 2132 2133 # if KMP_MIC && REDUCE_TEAM_SIZE 2134 // 2135 // Set the default team size. 2136 // 2137 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2138 __kmp_dflt_team_nth = teamSize; 2139 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2140 __kmp_dflt_team_nth)); 2141 } 2142 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2143 2144 if (__kmp_affinity_type == affinity_none) { 2145 __kmp_free(lastId); 2146 __kmp_free(totals); 2147 __kmp_free(maxCt); 2148 __kmp_free(counts); 2149 CLEANUP_THREAD_INFO; 2150 return 0; 2151 } 2152 2153 // 2154 // Count the number of levels which have more nodes at that level than 2155 // at the parent's level (with there being an implicit root node of 2156 // the top level). This is equivalent to saying that there is at least 2157 // one node at this level which has a sibling. These levels are in the 2158 // map, and the package level is always in the map. 
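//
// For example, on a hypothetical 2-package, 4-cores-per-package,
// 2-threads-per-core machine, totals[threadIdIndex]=16 >
// totals[coreIdIndex]=8 > totals[pkgIdIndex]=2 > 1, so the thread, core
// and package levels are all in the map and depth becomes 3.  If there
// were only one thread context per core (totals[threadIdIndex] ==
// totals[coreIdIndex]), the thread level would be left out of the map and
// depth would drop to 2.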
2159 // 2160 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2161 int level = 0; 2162 for (index = threadIdIndex; index < maxIndex; index++) { 2163 KMP_ASSERT(totals[index] >= totals[index + 1]); 2164 inMap[index] = (totals[index] > totals[index + 1]); 2165 } 2166 inMap[maxIndex] = (totals[maxIndex] > 1); 2167 inMap[pkgIdIndex] = true; 2168 2169 int depth = 0; 2170 for (index = threadIdIndex; index <= maxIndex; index++) { 2171 if (inMap[index]) { 2172 depth++; 2173 } 2174 } 2175 KMP_ASSERT(depth > 0); 2176 2177 // 2178 // Construct the data structure that is to be returned. 2179 // 2180 *address2os = (AddrUnsPair*) 2181 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2182 int pkgLevel = -1; 2183 int coreLevel = -1; 2184 int threadLevel = -1; 2185 2186 for (i = 0; i < num_avail; ++i) { 2187 Address addr(depth); 2188 unsigned os = threadInfo[i][osIdIndex]; 2189 int src_index; 2190 int dst_index = 0; 2191 2192 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2193 if (! inMap[src_index]) { 2194 continue; 2195 } 2196 addr.labels[dst_index] = threadInfo[i][src_index]; 2197 if (src_index == pkgIdIndex) { 2198 pkgLevel = dst_index; 2199 } 2200 else if (src_index == coreIdIndex) { 2201 coreLevel = dst_index; 2202 } 2203 else if (src_index == threadIdIndex) { 2204 threadLevel = dst_index; 2205 } 2206 dst_index++; 2207 } 2208 (*address2os)[i] = AddrUnsPair(addr, os); 2209 } 2210 2211 if (__kmp_affinity_gran_levels < 0) { 2212 // 2213 // Set the granularity level based on what levels are modeled 2214 // in the machine topology map. 2215 // 2216 unsigned src_index; 2217 __kmp_affinity_gran_levels = 0; 2218 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2219 if (! inMap[src_index]) { 2220 continue; 2221 } 2222 switch (src_index) { 2223 case threadIdIndex: 2224 if (__kmp_affinity_gran > affinity_gran_thread) { 2225 __kmp_affinity_gran_levels++; 2226 } 2227 2228 break; 2229 case coreIdIndex: 2230 if (__kmp_affinity_gran > affinity_gran_core) { 2231 __kmp_affinity_gran_levels++; 2232 } 2233 break; 2234 2235 case pkgIdIndex: 2236 if (__kmp_affinity_gran > affinity_gran_package) { 2237 __kmp_affinity_gran_levels++; 2238 } 2239 break; 2240 } 2241 } 2242 } 2243 2244 if (__kmp_affinity_verbose) { 2245 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2246 coreLevel, threadLevel); 2247 } 2248 2249 __kmp_free(inMap); 2250 __kmp_free(lastId); 2251 __kmp_free(totals); 2252 __kmp_free(maxCt); 2253 __kmp_free(counts); 2254 CLEANUP_THREAD_INFO; 2255 return depth; 2256 } 2257 2258 2259 // 2260 // Create and return a table of affinity masks, indexed by OS thread ID. 2261 // This routine handles OR'ing together all the affinity masks of threads 2262 // that are sufficiently close, if granularity > fine. 2263 // 2264 static kmp_affin_mask_t * 2265 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2266 AddrUnsPair *address2os, unsigned numAddrs) 2267 { 2268 // 2269 // First form a table of affinity masks in order of OS thread id. 2270 // 2271 unsigned depth; 2272 unsigned maxOsId; 2273 unsigned i; 2274 2275 KMP_ASSERT(numAddrs > 0); 2276 depth = address2os[0].first.depth; 2277 2278 maxOsId = 0; 2279 for (i = 0; i < numAddrs; i++) { 2280 unsigned osId = address2os[i].second; 2281 if (osId > maxOsId) { 2282 maxOsId = osId; 2283 } 2284 } 2285 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2286 (maxOsId + 1) * __kmp_affin_mask_size); 2287 2288 // 2289 // Sort the address2os table according to physical order. 
Doing so 2290 // will put all threads on the same core/package/node in consecutive 2291 // locations. 2292 // 2293 qsort(address2os, numAddrs, sizeof(*address2os), 2294 __kmp_affinity_cmp_Address_labels); 2295 2296 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2297 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2298 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2299 } 2300 if (__kmp_affinity_gran_levels >= (int)depth) { 2301 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2302 && (__kmp_affinity_type != affinity_none))) { 2303 KMP_WARNING(AffThreadsMayMigrate); 2304 } 2305 } 2306 2307 // 2308 // Run through the table, forming the masks for all threads on each 2309 // core. Threads on the same core will have identical "Address" 2310 // objects, not considering the last level, which must be the thread 2311 // id. All threads on a core will appear consecutively. 2312 // 2313 unsigned unique = 0; 2314 unsigned j = 0; // index of 1st thread on core 2315 unsigned leader = 0; 2316 Address *leaderAddr = &(address2os[0].first); 2317 kmp_affin_mask_t *sum 2318 = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 2319 KMP_CPU_ZERO(sum); 2320 KMP_CPU_SET(address2os[0].second, sum); 2321 for (i = 1; i < numAddrs; i++) { 2322 // 2323 // If this thread is sufficiently close to the leader (within the 2324 // granularity setting), then set the bit for this os thread in the 2325 // affinity mask for this group, and go on to the next thread. 2326 // 2327 if (leaderAddr->isClose(address2os[i].first, 2328 __kmp_affinity_gran_levels)) { 2329 KMP_CPU_SET(address2os[i].second, sum); 2330 continue; 2331 } 2332 2333 // 2334 // For every thread in this group, copy the mask to the thread's 2335 // entry in the osId2Mask table. Mark the first address as a 2336 // leader. 2337 // 2338 for (; j < i; j++) { 2339 unsigned osId = address2os[j].second; 2340 KMP_DEBUG_ASSERT(osId <= maxOsId); 2341 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2342 KMP_CPU_COPY(mask, sum); 2343 address2os[j].first.leader = (j == leader); 2344 } 2345 unique++; 2346 2347 // 2348 // Start a new mask. 2349 // 2350 leader = i; 2351 leaderAddr = &(address2os[i].first); 2352 KMP_CPU_ZERO(sum); 2353 KMP_CPU_SET(address2os[i].second, sum); 2354 } 2355 2356 // 2357 // For every thread in last group, copy the mask to the thread's 2358 // entry in the osId2Mask table. 2359 // 2360 for (; j < i; j++) { 2361 unsigned osId = address2os[j].second; 2362 KMP_DEBUG_ASSERT(osId <= maxOsId); 2363 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2364 KMP_CPU_COPY(mask, sum); 2365 address2os[j].first.leader = (j == leader); 2366 } 2367 unique++; 2368 2369 *maxIndex = maxOsId; 2370 *numUnique = unique; 2371 return osId2Mask; 2372 } 2373 2374 2375 // 2376 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2377 // as file-static than to try and pass them through the calling sequence of 2378 // the recursive-descent OMP_PLACES parser. 
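//
// As a rough sketch of what the two parsers below produce (assuming, for
// illustration only, that OS procs 0-7 all exist and are set in fullMask):
//
//   KMP_AFFINITY="explicit,proclist=[3,0-2,{4,5}]"
//       -> newMasks = { {3}, {0}, {1}, {2}, {4,5} }   (5 masks)
//
//   OMP_PLACES="{0,1}:4:2"   (place {0,1}, count 4, stride 2)
//       -> newMasks = { {0,1}, {2,3}, {4,5}, {6,7} }  (4 masks)
//
// The exact syntax accepted is defined by the parsing code and the grammar
// given further down.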
2379 // 2380 static kmp_affin_mask_t *newMasks; 2381 static int numNewMasks; 2382 static int nextNewMask; 2383 2384 #define ADD_MASK(_mask) \ 2385 { \ 2386 if (nextNewMask >= numNewMasks) { \ 2387 numNewMasks *= 2; \ 2388 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2389 numNewMasks * __kmp_affin_mask_size); \ 2390 } \ 2391 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2392 nextNewMask++; \ 2393 } 2394 2395 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2396 { \ 2397 if (((_osId) > _maxOsId) || \ 2398 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2399 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2400 && (__kmp_affinity_type != affinity_none))) { \ 2401 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2402 } \ 2403 } \ 2404 else { \ 2405 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2406 } \ 2407 } 2408 2409 2410 // 2411 // Re-parse the proclist (for the explicit affinity type), and form the list 2412 // of affinity newMasks indexed by gtid. 2413 // 2414 static void 2415 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2416 unsigned int *out_numMasks, const char *proclist, 2417 kmp_affin_mask_t *osId2Mask, int maxOsId) 2418 { 2419 const char *scan = proclist; 2420 const char *next = proclist; 2421 2422 // 2423 // We use malloc() for the temporary mask vector, 2424 // so that we can use realloc() to extend it. 2425 // 2426 numNewMasks = 2; 2427 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2428 * __kmp_affin_mask_size); 2429 nextNewMask = 0; 2430 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2431 __kmp_affin_mask_size); 2432 int setSize = 0; 2433 2434 for (;;) { 2435 int start, end, stride; 2436 2437 SKIP_WS(scan); 2438 next = scan; 2439 if (*next == '\0') { 2440 break; 2441 } 2442 2443 if (*next == '{') { 2444 int num; 2445 setSize = 0; 2446 next++; // skip '{' 2447 SKIP_WS(next); 2448 scan = next; 2449 2450 // 2451 // Read the first integer in the set. 2452 // 2453 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2454 "bad proclist"); 2455 SKIP_DIGITS(next); 2456 num = __kmp_str_to_int(scan, *next); 2457 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2458 2459 // 2460 // Copy the mask for that osId to the sum (union) mask. 2461 // 2462 if ((num > maxOsId) || 2463 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2464 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2465 && (__kmp_affinity_type != affinity_none))) { 2466 KMP_WARNING(AffIgnoreInvalidProcID, num); 2467 } 2468 KMP_CPU_ZERO(sumMask); 2469 } 2470 else { 2471 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2472 setSize = 1; 2473 } 2474 2475 for (;;) { 2476 // 2477 // Check for end of set. 2478 // 2479 SKIP_WS(next); 2480 if (*next == '}') { 2481 next++; // skip '}' 2482 break; 2483 } 2484 2485 // 2486 // Skip optional comma. 2487 // 2488 if (*next == ',') { 2489 next++; 2490 } 2491 SKIP_WS(next); 2492 2493 // 2494 // Read the next integer in the set. 2495 // 2496 scan = next; 2497 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2498 "bad explicit proc list"); 2499 2500 SKIP_DIGITS(next); 2501 num = __kmp_str_to_int(scan, *next); 2502 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2503 2504 // 2505 // Add the mask for that osId to the sum mask. 2506 // 2507 if ((num > maxOsId) || 2508 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2509 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2510 && (__kmp_affinity_type != affinity_none))) { 2511 KMP_WARNING(AffIgnoreInvalidProcID, num); 2512 } 2513 } 2514 else { 2515 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2516 setSize++; 2517 } 2518 } 2519 if (setSize > 0) { 2520 ADD_MASK(sumMask); 2521 } 2522 2523 SKIP_WS(next); 2524 if (*next == ',') { 2525 next++; 2526 } 2527 scan = next; 2528 continue; 2529 } 2530 2531 // 2532 // Read the first integer. 2533 // 2534 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2535 SKIP_DIGITS(next); 2536 start = __kmp_str_to_int(scan, *next); 2537 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2538 SKIP_WS(next); 2539 2540 // 2541 // If this isn't a range, then add a mask to the list and go on. 2542 // 2543 if (*next != '-') { 2544 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2545 2546 // 2547 // Skip optional comma. 2548 // 2549 if (*next == ',') { 2550 next++; 2551 } 2552 scan = next; 2553 continue; 2554 } 2555 2556 // 2557 // This is a range. Skip over the '-' and read in the 2nd int. 2558 // 2559 next++; // skip '-' 2560 SKIP_WS(next); 2561 scan = next; 2562 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2563 SKIP_DIGITS(next); 2564 end = __kmp_str_to_int(scan, *next); 2565 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2566 2567 // 2568 // Check for a stride parameter 2569 // 2570 stride = 1; 2571 SKIP_WS(next); 2572 if (*next == ':') { 2573 // 2574 // A stride is specified. Skip over the ':" and read the 3rd int. 2575 // 2576 int sign = +1; 2577 next++; // skip ':' 2578 SKIP_WS(next); 2579 scan = next; 2580 if (*next == '-') { 2581 sign = -1; 2582 next++; 2583 SKIP_WS(next); 2584 scan = next; 2585 } 2586 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2587 "bad explicit proc list"); 2588 SKIP_DIGITS(next); 2589 stride = __kmp_str_to_int(scan, *next); 2590 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2591 stride *= sign; 2592 } 2593 2594 // 2595 // Do some range checks. 2596 // 2597 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2598 if (stride > 0) { 2599 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2600 } 2601 else { 2602 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2603 } 2604 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2605 2606 // 2607 // Add the mask for each OS proc # to the list. 2608 // 2609 if (stride > 0) { 2610 do { 2611 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2612 start += stride; 2613 } while (start <= end); 2614 } 2615 else { 2616 do { 2617 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2618 start += stride; 2619 } while (start >= end); 2620 } 2621 2622 // 2623 // Skip optional comma. 2624 // 2625 SKIP_WS(next); 2626 if (*next == ',') { 2627 next++; 2628 } 2629 scan = next; 2630 } 2631 2632 *out_numMasks = nextNewMask; 2633 if (nextNewMask == 0) { 2634 *out_masks = NULL; 2635 KMP_INTERNAL_FREE(newMasks); 2636 return; 2637 } 2638 *out_masks 2639 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2640 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2641 __kmp_free(sumMask); 2642 KMP_INTERNAL_FREE(newMasks); 2643 } 2644 2645 2646 # if OMP_40_ENABLED 2647 2648 /*----------------------------------------------------------------------------- 2649 2650 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2651 places. 
Again, Here is the grammar: 2652 2653 place_list := place 2654 place_list := place , place_list 2655 place := num 2656 place := place : num 2657 place := place : num : signed 2658 place := { subplacelist } 2659 place := ! place // (lowest priority) 2660 subplace_list := subplace 2661 subplace_list := subplace , subplace_list 2662 subplace := num 2663 subplace := num : num 2664 subplace := num : num : signed 2665 signed := num 2666 signed := + signed 2667 signed := - signed 2668 2669 -----------------------------------------------------------------------------*/ 2670 2671 static void 2672 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2673 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2674 { 2675 const char *next; 2676 2677 for (;;) { 2678 int start, count, stride, i; 2679 2680 // 2681 // Read in the starting proc id 2682 // 2683 SKIP_WS(*scan); 2684 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2685 "bad explicit places list"); 2686 next = *scan; 2687 SKIP_DIGITS(next); 2688 start = __kmp_str_to_int(*scan, *next); 2689 KMP_ASSERT(start >= 0); 2690 *scan = next; 2691 2692 // 2693 // valid follow sets are ',' ':' and '}' 2694 // 2695 SKIP_WS(*scan); 2696 if (**scan == '}' || **scan == ',') { 2697 if ((start > maxOsId) || 2698 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2699 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2700 && (__kmp_affinity_type != affinity_none))) { 2701 KMP_WARNING(AffIgnoreInvalidProcID, start); 2702 } 2703 } 2704 else { 2705 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2706 (*setSize)++; 2707 } 2708 if (**scan == '}') { 2709 break; 2710 } 2711 (*scan)++; // skip ',' 2712 continue; 2713 } 2714 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2715 (*scan)++; // skip ':' 2716 2717 // 2718 // Read count parameter 2719 // 2720 SKIP_WS(*scan); 2721 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2722 "bad explicit places list"); 2723 next = *scan; 2724 SKIP_DIGITS(next); 2725 count = __kmp_str_to_int(*scan, *next); 2726 KMP_ASSERT(count >= 0); 2727 *scan = next; 2728 2729 // 2730 // valid follow sets are ',' ':' and '}' 2731 // 2732 SKIP_WS(*scan); 2733 if (**scan == '}' || **scan == ',') { 2734 for (i = 0; i < count; i++) { 2735 if ((start > maxOsId) || 2736 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2737 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2738 && (__kmp_affinity_type != affinity_none))) { 2739 KMP_WARNING(AffIgnoreInvalidProcID, start); 2740 } 2741 break; // don't proliferate warnings for large count 2742 } 2743 else { 2744 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2745 start++; 2746 (*setSize)++; 2747 } 2748 } 2749 if (**scan == '}') { 2750 break; 2751 } 2752 (*scan)++; // skip ',' 2753 continue; 2754 } 2755 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2756 (*scan)++; // skip ':' 2757 2758 // 2759 // Read stride parameter 2760 // 2761 int sign = +1; 2762 for (;;) { 2763 SKIP_WS(*scan); 2764 if (**scan == '+') { 2765 (*scan)++; // skip '+' 2766 continue; 2767 } 2768 if (**scan == '-') { 2769 sign *= -1; 2770 (*scan)++; // skip '-' 2771 continue; 2772 } 2773 break; 2774 } 2775 SKIP_WS(*scan); 2776 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2777 "bad explicit places list"); 2778 next = *scan; 2779 SKIP_DIGITS(next); 2780 stride = __kmp_str_to_int(*scan, *next); 2781 KMP_ASSERT(stride >= 0); 2782 *scan = next; 2783 stride *= sign; 2784 2785 // 2786 // valid follow sets are ',' and '}' 2787 // 2788 SKIP_WS(*scan); 2789 if (**scan == '}' || **scan == ',') { 2790 for (i = 0; i < count; i++) { 2791 if ((start > maxOsId) || 2792 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2793 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2794 && (__kmp_affinity_type != affinity_none))) { 2795 KMP_WARNING(AffIgnoreInvalidProcID, start); 2796 } 2797 break; // don't proliferate warnings for large count 2798 } 2799 else { 2800 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2801 start += stride; 2802 (*setSize)++; 2803 } 2804 } 2805 if (**scan == '}') { 2806 break; 2807 } 2808 (*scan)++; // skip ',' 2809 continue; 2810 } 2811 2812 KMP_ASSERT2(0, "bad explicit places list"); 2813 } 2814 } 2815 2816 2817 static void 2818 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 2819 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2820 { 2821 const char *next; 2822 2823 // 2824 // valid follow sets are '{' '!' and num 2825 // 2826 SKIP_WS(*scan); 2827 if (**scan == '{') { 2828 (*scan)++; // skip '{' 2829 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 2830 setSize); 2831 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 2832 (*scan)++; // skip '}' 2833 } 2834 else if (**scan == '!') { 2835 (*scan)++; // skip '!' 2836 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 2837 KMP_CPU_COMPLEMENT(tempMask); 2838 } 2839 else if ((**scan >= '0') && (**scan <= '9')) { 2840 next = *scan; 2841 SKIP_DIGITS(next); 2842 int num = __kmp_str_to_int(*scan, *next); 2843 KMP_ASSERT(num >= 0); 2844 if ((num > maxOsId) || 2845 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2846 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2847 && (__kmp_affinity_type != affinity_none))) { 2848 KMP_WARNING(AffIgnoreInvalidProcID, num); 2849 } 2850 } 2851 else { 2852 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 2853 (*setSize)++; 2854 } 2855 *scan = next; // skip num 2856 } 2857 else { 2858 KMP_ASSERT2(0, "bad explicit places list"); 2859 } 2860 } 2861 2862 2863 //static void 2864 void 2865 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 2866 unsigned int *out_numMasks, const char *placelist, 2867 kmp_affin_mask_t *osId2Mask, int maxOsId) 2868 { 2869 const char *scan = placelist; 2870 const char *next = placelist; 2871 2872 numNewMasks = 2; 2873 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2874 * __kmp_affin_mask_size); 2875 nextNewMask = 0; 2876 2877 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 2878 __kmp_affin_mask_size); 2879 KMP_CPU_ZERO(tempMask); 2880 int setSize = 0; 2881 2882 for (;;) { 2883 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 2884 2885 // 2886 // valid follow sets are ',' ':' and EOL 2887 // 2888 SKIP_WS(scan); 2889 if (*scan == '\0' || *scan == ',') { 2890 if (setSize > 0) { 2891 ADD_MASK(tempMask); 2892 } 2893 KMP_CPU_ZERO(tempMask); 2894 setSize = 0; 2895 if (*scan == '\0') { 2896 break; 2897 } 2898 scan++; // skip ',' 2899 continue; 2900 } 2901 2902 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2903 scan++; // skip ':' 2904 2905 // 2906 // Read count parameter 2907 // 2908 SKIP_WS(scan); 2909 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 2910 "bad explicit places list"); 2911 next = scan; 2912 SKIP_DIGITS(next); 2913 int count = __kmp_str_to_int(scan, *next); 2914 KMP_ASSERT(count >= 0); 2915 scan = next; 2916 2917 // 2918 // valid follow sets are ',' ':' and EOL 2919 // 2920 SKIP_WS(scan); 2921 int stride; 2922 if (*scan == '\0' || *scan == ',') { 2923 stride = +1; 2924 } 2925 else { 2926 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2927 scan++; // skip ':' 2928 2929 // 2930 // Read stride parameter 2931 // 2932 int sign = +1; 2933 for (;;) { 2934 SKIP_WS(scan); 2935 if (*scan == '+') { 2936 scan++; // skip '+' 2937 continue; 2938 } 2939 if (*scan == '-') { 2940 sign *= -1; 2941 scan++; // skip '-' 2942 continue; 2943 } 2944 break; 2945 } 2946 SKIP_WS(scan); 2947 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 2948 "bad explicit places list"); 2949 next = scan; 2950 SKIP_DIGITS(next); 2951 stride = __kmp_str_to_int(scan, *next); 2952 KMP_DEBUG_ASSERT(stride >= 0); 2953 scan = next; 2954 stride *= sign; 2955 } 2956 2957 if (stride > 0) { 2958 int i; 2959 for (i = 0; i < count; i++) { 2960 int j; 2961 if (setSize == 0) { 2962 break; 2963 } 2964 ADD_MASK(tempMask); 2965 setSize = 0; 2966 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 2967 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 2968 KMP_CPU_CLR(j, tempMask); 2969 } 2970 else if ((j > maxOsId) || 2971 (! 
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 2972 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 2973 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 2974 KMP_WARNING(AffIgnoreInvalidProcID, j); 2975 } 2976 KMP_CPU_CLR(j, tempMask); 2977 } 2978 else { 2979 KMP_CPU_SET(j, tempMask); 2980 setSize++; 2981 } 2982 } 2983 for (; j >= 0; j--) { 2984 KMP_CPU_CLR(j, tempMask); 2985 } 2986 } 2987 } 2988 else { 2989 int i; 2990 for (i = 0; i < count; i++) { 2991 int j; 2992 if (setSize == 0) { 2993 break; 2994 } 2995 ADD_MASK(tempMask); 2996 setSize = 0; 2997 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 2998 j++) { 2999 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3000 KMP_CPU_CLR(j, tempMask); 3001 } 3002 else if ((j > maxOsId) || 3003 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3004 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3005 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3006 KMP_WARNING(AffIgnoreInvalidProcID, j); 3007 } 3008 KMP_CPU_CLR(j, tempMask); 3009 } 3010 else { 3011 KMP_CPU_SET(j, tempMask); 3012 setSize++; 3013 } 3014 } 3015 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3016 KMP_CPU_CLR(j, tempMask); 3017 } 3018 } 3019 } 3020 KMP_CPU_ZERO(tempMask); 3021 setSize = 0; 3022 3023 // 3024 // valid follow sets are ',' and EOL 3025 // 3026 SKIP_WS(scan); 3027 if (*scan == '\0') { 3028 break; 3029 } 3030 if (*scan == ',') { 3031 scan++; // skip ',' 3032 continue; 3033 } 3034 3035 KMP_ASSERT2(0, "bad explicit places list"); 3036 } 3037 3038 *out_numMasks = nextNewMask; 3039 if (nextNewMask == 0) { 3040 *out_masks = NULL; 3041 KMP_INTERNAL_FREE(newMasks); 3042 return; 3043 } 3044 *out_masks 3045 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3046 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3047 __kmp_free(tempMask); 3048 KMP_INTERNAL_FREE(newMasks); 3049 } 3050 3051 # endif /* OMP_40_ENABLED */ 3052 3053 #undef ADD_MASK 3054 #undef ADD_MASK_OSID 3055 3056 static void 3057 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3058 { 3059 if (__kmp_place_num_sockets == 0 && 3060 __kmp_place_num_cores == 0 && 3061 __kmp_place_num_threads_per_core == 0 ) 3062 return; // no topology limiting actions requested, exit 3063 if (__kmp_place_num_sockets == 0) 3064 __kmp_place_num_sockets = nPackages; // use all available sockets 3065 if (__kmp_place_num_cores == 0) 3066 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3067 if (__kmp_place_num_threads_per_core == 0 || 3068 __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore) 3069 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3070 3071 if ( !__kmp_affinity_uniform_topology() ) { 3072 KMP_WARNING( AffThrPlaceNonUniform ); 3073 return; // don't support non-uniform topology 3074 } 3075 if ( depth != 3 ) { 3076 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3077 return; // don't support not-3-level topology 3078 } 3079 if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) { 3080 KMP_WARNING(AffThrPlaceManySockets); 3081 return; 3082 } 3083 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3084 KMP_WARNING( AffThrPlaceManyCores ); 3085 return; 3086 } 3087 3088 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3089 __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3090 3091 int i, j, k, n_old = 0, n_new = 0; 3092 for (i = 0; i < nPackages; ++i) 3093 if (i < 
__kmp_place_socket_offset || 3094 i >= __kmp_place_socket_offset + __kmp_place_num_sockets) 3095 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket 3096 else 3097 for (j = 0; j < nCoresPerPkg; ++j) // walk through requested socket 3098 if (j < __kmp_place_core_offset || 3099 j >= __kmp_place_core_offset + __kmp_place_num_cores) 3100 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3101 else 3102 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core 3103 if (k < __kmp_place_num_threads_per_core) { 3104 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data 3105 n_new++; 3106 } 3107 n_old++; 3108 } 3109 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3110 KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores * 3111 __kmp_place_num_threads_per_core); 3112 3113 nPackages = __kmp_place_num_sockets; // correct nPackages 3114 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3115 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3116 __kmp_avail_proc = n_new; // correct avail_proc 3117 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3118 3119 __kmp_free( *pAddr ); 3120 *pAddr = newAddr; // replace old topology with new one 3121 } 3122 3123 3124 static AddrUnsPair *address2os = NULL; 3125 static int * procarr = NULL; 3126 static int __kmp_aff_depth = 0; 3127 3128 static void 3129 __kmp_aux_affinity_initialize(void) 3130 { 3131 if (__kmp_affinity_masks != NULL) { 3132 KMP_ASSERT(fullMask != NULL); 3133 return; 3134 } 3135 3136 // 3137 // Create the "full" mask - this defines all of the processors that we 3138 // consider to be in the machine model. If respect is set, then it is 3139 // the initialization thread's affinity mask. Otherwise, it is all 3140 // processors that we know about on the machine. 3141 // 3142 if (fullMask == NULL) { 3143 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3144 } 3145 if (KMP_AFFINITY_CAPABLE()) { 3146 if (__kmp_affinity_respect_mask) { 3147 __kmp_get_system_affinity(fullMask, TRUE); 3148 3149 // 3150 // Count the number of available processors. 3151 // 3152 unsigned i; 3153 __kmp_avail_proc = 0; 3154 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3155 if (! KMP_CPU_ISSET(i, fullMask)) { 3156 continue; 3157 } 3158 __kmp_avail_proc++; 3159 } 3160 if (__kmp_avail_proc > __kmp_xproc) { 3161 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3162 && (__kmp_affinity_type != affinity_none))) { 3163 KMP_WARNING(ErrorInitializeAffinity); 3164 } 3165 __kmp_affinity_type = affinity_none; 3166 KMP_AFFINITY_DISABLE(); 3167 return; 3168 } 3169 } 3170 else { 3171 __kmp_affinity_entire_machine_mask(fullMask); 3172 __kmp_avail_proc = __kmp_xproc; 3173 } 3174 } 3175 3176 int depth = -1; 3177 kmp_i18n_id_t msg_id = kmp_i18n_null; 3178 3179 // 3180 // For backward compatibility, setting KMP_CPUINFO_FILE => 3181 // KMP_TOPOLOGY_METHOD=cpuinfo 3182 // 3183 if ((__kmp_cpuinfo_file != NULL) && 3184 (__kmp_affinity_top_method == affinity_top_method_all)) { 3185 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3186 } 3187 3188 if (__kmp_affinity_top_method == affinity_top_method_all) { 3189 // 3190 // In the default code path, errors are not fatal - we just try using 3191 // another method. We only emit a warning message if affinity is on, 3192 // or the verbose flag is set, an the nowarnings flag was not set. 
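//
// Concretely, the fall-back chain tried below is: the x2APIC id decoder
// and then the legacy APIC id decoder (x86/x86_64 only), then
// /proc/cpuinfo (Linux only), then the processor group map (when more
// than one Windows* OS processor group is present), and finally the flat
// OS-proc map, which is not expected to fail.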
3193 // 3194 const char *file_name = NULL; 3195 int line = 0; 3196 3197 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3198 3199 if (__kmp_affinity_verbose) { 3200 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3201 } 3202 3203 file_name = NULL; 3204 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3205 if (depth == 0) { 3206 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3207 KMP_ASSERT(address2os == NULL); 3208 return; 3209 } 3210 3211 if (depth < 0) { 3212 if (__kmp_affinity_verbose) { 3213 if (msg_id != kmp_i18n_null) { 3214 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3215 KMP_I18N_STR(DecodingLegacyAPIC)); 3216 } 3217 else { 3218 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3219 } 3220 } 3221 3222 file_name = NULL; 3223 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3224 if (depth == 0) { 3225 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3226 KMP_ASSERT(address2os == NULL); 3227 return; 3228 } 3229 } 3230 3231 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3232 3233 # if KMP_OS_LINUX 3234 3235 if (depth < 0) { 3236 if (__kmp_affinity_verbose) { 3237 if (msg_id != kmp_i18n_null) { 3238 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3239 } 3240 else { 3241 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3242 } 3243 } 3244 3245 FILE *f = fopen("/proc/cpuinfo", "r"); 3246 if (f == NULL) { 3247 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3248 } 3249 else { 3250 file_name = "/proc/cpuinfo"; 3251 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3252 fclose(f); 3253 if (depth == 0) { 3254 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3255 KMP_ASSERT(address2os == NULL); 3256 return; 3257 } 3258 } 3259 } 3260 3261 # endif /* KMP_OS_LINUX */ 3262 3263 # if KMP_GROUP_AFFINITY 3264 3265 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3266 if (__kmp_affinity_verbose) { 3267 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3268 } 3269 3270 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3271 KMP_ASSERT(depth != 0); 3272 } 3273 3274 # endif /* KMP_GROUP_AFFINITY */ 3275 3276 if (depth < 0) { 3277 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3278 if (file_name == NULL) { 3279 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3280 } 3281 else if (line == 0) { 3282 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3283 } 3284 else { 3285 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3286 } 3287 } 3288 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3289 3290 file_name = ""; 3291 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3292 if (depth == 0) { 3293 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3294 KMP_ASSERT(address2os == NULL); 3295 return; 3296 } 3297 KMP_ASSERT(depth > 0); 3298 KMP_ASSERT(address2os != NULL); 3299 } 3300 } 3301 3302 // 3303 // If the user has specified that a paricular topology discovery method 3304 // is to be used, then we abort if that method fails. The exception is 3305 // group affinity, which might have been implicitly set. 
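//
// For example, if KMP_TOPOLOGY_METHOD=cpuinfo was requested (or implied
// by setting KMP_CPUINFO_FILE, see above), a failure of the cpuinfo
// parser below is reported with a fatal message rather than silently
// falling back to the flat map.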
3306 // 3307 3308 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3309 3310 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3311 if (__kmp_affinity_verbose) { 3312 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3313 KMP_I18N_STR(Decodingx2APIC)); 3314 } 3315 3316 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3317 if (depth == 0) { 3318 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3319 KMP_ASSERT(address2os == NULL); 3320 return; 3321 } 3322 if (depth < 0) { 3323 KMP_ASSERT(msg_id != kmp_i18n_null); 3324 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3325 } 3326 } 3327 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3328 if (__kmp_affinity_verbose) { 3329 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3330 KMP_I18N_STR(DecodingLegacyAPIC)); 3331 } 3332 3333 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3334 if (depth == 0) { 3335 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3336 KMP_ASSERT(address2os == NULL); 3337 return; 3338 } 3339 if (depth < 0) { 3340 KMP_ASSERT(msg_id != kmp_i18n_null); 3341 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3342 } 3343 } 3344 3345 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3346 3347 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3348 const char *filename; 3349 if (__kmp_cpuinfo_file != NULL) { 3350 filename = __kmp_cpuinfo_file; 3351 } 3352 else { 3353 filename = "/proc/cpuinfo"; 3354 } 3355 3356 if (__kmp_affinity_verbose) { 3357 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3358 } 3359 3360 FILE *f = fopen(filename, "r"); 3361 if (f == NULL) { 3362 int code = errno; 3363 if (__kmp_cpuinfo_file != NULL) { 3364 __kmp_msg( 3365 kmp_ms_fatal, 3366 KMP_MSG(CantOpenFileForReading, filename), 3367 KMP_ERR(code), 3368 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3369 __kmp_msg_null 3370 ); 3371 } 3372 else { 3373 __kmp_msg( 3374 kmp_ms_fatal, 3375 KMP_MSG(CantOpenFileForReading, filename), 3376 KMP_ERR(code), 3377 __kmp_msg_null 3378 ); 3379 } 3380 } 3381 int line = 0; 3382 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3383 fclose(f); 3384 if (depth < 0) { 3385 KMP_ASSERT(msg_id != kmp_i18n_null); 3386 if (line > 0) { 3387 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3388 } 3389 else { 3390 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3391 } 3392 } 3393 if (__kmp_affinity_type == affinity_none) { 3394 KMP_ASSERT(depth == 0); 3395 KMP_ASSERT(address2os == NULL); 3396 return; 3397 } 3398 } 3399 3400 # if KMP_GROUP_AFFINITY 3401 3402 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3403 if (__kmp_affinity_verbose) { 3404 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3405 } 3406 3407 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3408 KMP_ASSERT(depth != 0); 3409 if (depth < 0) { 3410 KMP_ASSERT(msg_id != kmp_i18n_null); 3411 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3412 } 3413 } 3414 3415 # endif /* KMP_GROUP_AFFINITY */ 3416 3417 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3418 if (__kmp_affinity_verbose) { 3419 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3420 } 3421 3422 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3423 if (depth == 0) { 3424 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3425 KMP_ASSERT(address2os == NULL); 3426 return; 3427 } 3428 // should not fail 3429 KMP_ASSERT(depth > 0); 3430 KMP_ASSERT(address2os != NULL); 3431 } 3432 3433 if (address2os == NULL) { 3434 if 
(KMP_AFFINITY_CAPABLE() 3435 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3436 && (__kmp_affinity_type != affinity_none)))) { 3437 KMP_WARNING(ErrorInitializeAffinity); 3438 } 3439 __kmp_affinity_type = affinity_none; 3440 KMP_AFFINITY_DISABLE(); 3441 return; 3442 } 3443 3444 __kmp_apply_thread_places(&address2os, depth); 3445 3446 // 3447 // Create the table of masks, indexed by thread Id. 3448 // 3449 unsigned maxIndex; 3450 unsigned numUnique; 3451 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3452 address2os, __kmp_avail_proc); 3453 if (__kmp_affinity_gran_levels == 0) { 3454 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3455 } 3456 3457 // 3458 // Set the childNums vector in all Address objects. This must be done 3459 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3460 // which takes into account the setting of __kmp_affinity_compact. 3461 // 3462 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3463 3464 switch (__kmp_affinity_type) { 3465 3466 case affinity_explicit: 3467 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3468 # if OMP_40_ENABLED 3469 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3470 # endif 3471 { 3472 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3473 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3474 maxIndex); 3475 } 3476 # if OMP_40_ENABLED 3477 else { 3478 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3479 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3480 maxIndex); 3481 } 3482 # endif 3483 if (__kmp_affinity_num_masks == 0) { 3484 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3485 && (__kmp_affinity_type != affinity_none))) { 3486 KMP_WARNING(AffNoValidProcID); 3487 } 3488 __kmp_affinity_type = affinity_none; 3489 return; 3490 } 3491 break; 3492 3493 // 3494 // The other affinity types rely on sorting the Addresses according 3495 // to some permutation of the machine topology tree. Set 3496 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3497 // then jump to a common code fragment to do the sort and create 3498 // the array of affinity masks. 
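//
// For instance, with a depth-3 (package/core/thread) map and the default
// permute value of 0, "scatter" below ends up with __kmp_affinity_compact
// = depth - 1 - 0 = 2 while "compact" keeps 0; and for the "logical" and
// "physical" types an offset of 3 on a machine with 2 thread contexts per
// core and 16 available procs becomes (2 * 3) % 16 = 6.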
3499 // 3500 3501 case affinity_logical: 3502 __kmp_affinity_compact = 0; 3503 if (__kmp_affinity_offset) { 3504 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3505 % __kmp_avail_proc; 3506 } 3507 goto sortAddresses; 3508 3509 case affinity_physical: 3510 if (__kmp_nThreadsPerCore > 1) { 3511 __kmp_affinity_compact = 1; 3512 if (__kmp_affinity_compact >= depth) { 3513 __kmp_affinity_compact = 0; 3514 } 3515 } else { 3516 __kmp_affinity_compact = 0; 3517 } 3518 if (__kmp_affinity_offset) { 3519 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3520 % __kmp_avail_proc; 3521 } 3522 goto sortAddresses; 3523 3524 case affinity_scatter: 3525 if (__kmp_affinity_compact >= depth) { 3526 __kmp_affinity_compact = 0; 3527 } 3528 else { 3529 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3530 } 3531 goto sortAddresses; 3532 3533 case affinity_compact: 3534 if (__kmp_affinity_compact >= depth) { 3535 __kmp_affinity_compact = depth - 1; 3536 } 3537 goto sortAddresses; 3538 3539 case affinity_balanced: 3540 // Balanced works only for the case of a single package 3541 if( nPackages > 1 ) { 3542 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3543 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3544 } 3545 __kmp_affinity_type = affinity_none; 3546 return; 3547 } else if( __kmp_affinity_uniform_topology() ) { 3548 break; 3549 } else { // Non-uniform topology 3550 3551 // Save the depth for further usage 3552 __kmp_aff_depth = depth; 3553 3554 // Number of hyper threads per core in HT machine 3555 int nth_per_core = __kmp_nThreadsPerCore; 3556 3557 int core_level; 3558 if( nth_per_core > 1 ) { 3559 core_level = depth - 2; 3560 } else { 3561 core_level = depth - 1; 3562 } 3563 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3564 int nproc = nth_per_core * ncores; 3565 3566 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3567 for( int i = 0; i < nproc; i++ ) { 3568 procarr[ i ] = -1; 3569 } 3570 3571 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3572 int proc = address2os[ i ].second; 3573 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3574 // If there is only one thread per core then depth == 2: level 0 - package, 3575 // level 1 - core. 3576 int level = depth - 1; 3577 3578 // __kmp_nth_per_core == 1 3579 int thread = 0; 3580 int core = address2os[ i ].first.labels[ level ]; 3581 // If the thread level exists, that is we have more than one thread context per core 3582 if( nth_per_core > 1 ) { 3583 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3584 core = address2os[ i ].first.labels[ level - 1 ]; 3585 } 3586 procarr[ core * nth_per_core + thread ] = proc; 3587 } 3588 3589 break; 3590 } 3591 3592 sortAddresses: 3593 // 3594 // Allocate the gtid->affinity mask table. 
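//
// E.g. if a granularity of "core" grouped 8 OS procs into 4 per-core
// masks, numUnique is 4: when __kmp_affinity_dups is set every OS proc
// keeps its own (duplicated) entry, so __kmp_affinity_num_masks ==
// __kmp_avail_proc == 8, otherwise only the 4 leader masks are kept.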
3595 // 3596 if (__kmp_affinity_dups) { 3597 __kmp_affinity_num_masks = __kmp_avail_proc; 3598 } 3599 else { 3600 __kmp_affinity_num_masks = numUnique; 3601 } 3602 3603 # if OMP_40_ENABLED 3604 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3605 && ( __kmp_affinity_num_places > 0 ) 3606 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3607 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3608 } 3609 # endif 3610 3611 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3612 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3613 3614 // 3615 // Sort the address2os table according to the current setting of 3616 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3617 // 3618 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3619 __kmp_affinity_cmp_Address_child_num); 3620 { 3621 int i; 3622 unsigned j; 3623 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3624 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3625 continue; 3626 } 3627 unsigned osId = address2os[i].second; 3628 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3629 kmp_affin_mask_t *dest 3630 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3631 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3632 KMP_CPU_COPY(dest, src); 3633 if (++j >= __kmp_affinity_num_masks) { 3634 break; 3635 } 3636 } 3637 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3638 } 3639 break; 3640 3641 default: 3642 KMP_ASSERT2(0, "Unexpected affinity setting"); 3643 } 3644 3645 __kmp_free(osId2Mask); 3646 machine_hierarchy.init(address2os, __kmp_avail_proc); 3647 } 3648 3649 3650 void 3651 __kmp_affinity_initialize(void) 3652 { 3653 // 3654 // Much of the code above was written assumming that if a machine was not 3655 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3656 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3657 // 3658 // There are too many checks for __kmp_affinity_type == affinity_none 3659 // in this code. Instead of trying to change them all, check if 3660 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3661 // affinity_none, call the real initialization routine, then restore 3662 // __kmp_affinity_type to affinity_disabled. 3663 // 3664 int disabled = (__kmp_affinity_type == affinity_disabled); 3665 if (! KMP_AFFINITY_CAPABLE()) { 3666 KMP_ASSERT(disabled); 3667 } 3668 if (disabled) { 3669 __kmp_affinity_type = affinity_none; 3670 } 3671 __kmp_aux_affinity_initialize(); 3672 if (disabled) { 3673 __kmp_affinity_type = affinity_disabled; 3674 } 3675 } 3676 3677 3678 void 3679 __kmp_affinity_uninitialize(void) 3680 { 3681 if (__kmp_affinity_masks != NULL) { 3682 __kmp_free(__kmp_affinity_masks); 3683 __kmp_affinity_masks = NULL; 3684 } 3685 if (fullMask != NULL) { 3686 KMP_CPU_FREE(fullMask); 3687 fullMask = NULL; 3688 } 3689 __kmp_affinity_num_masks = 0; 3690 # if OMP_40_ENABLED 3691 __kmp_affinity_num_places = 0; 3692 # endif 3693 if (__kmp_affinity_proclist != NULL) { 3694 __kmp_free(__kmp_affinity_proclist); 3695 __kmp_affinity_proclist = NULL; 3696 } 3697 if( address2os != NULL ) { 3698 __kmp_free( address2os ); 3699 address2os = NULL; 3700 } 3701 if( procarr != NULL ) { 3702 __kmp_free( procarr ); 3703 procarr = NULL; 3704 } 3705 } 3706 3707 3708 void 3709 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3710 { 3711 if (! 
KMP_AFFINITY_CAPABLE()) { 3712 return; 3713 } 3714 3715 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 3716 if (th->th.th_affin_mask == NULL) { 3717 KMP_CPU_ALLOC(th->th.th_affin_mask); 3718 } 3719 else { 3720 KMP_CPU_ZERO(th->th.th_affin_mask); 3721 } 3722 3723 // 3724 // Copy the thread mask to the kmp_info_t strucuture. 3725 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one 3726 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask 3727 // is set, then the full mask is the same as the mask of the initialization 3728 // thread. 3729 // 3730 kmp_affin_mask_t *mask; 3731 int i; 3732 3733 # if OMP_40_ENABLED 3734 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3735 # endif 3736 { 3737 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced) 3738 ) { 3739 # if KMP_GROUP_AFFINITY 3740 if (__kmp_num_proc_groups > 1) { 3741 return; 3742 } 3743 # endif 3744 KMP_ASSERT(fullMask != NULL); 3745 i = KMP_PLACE_ALL; 3746 mask = fullMask; 3747 } 3748 else { 3749 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 3750 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 3751 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 3752 } 3753 } 3754 # if OMP_40_ENABLED 3755 else { 3756 if ((! isa_root) 3757 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 3758 # if KMP_GROUP_AFFINITY 3759 if (__kmp_num_proc_groups > 1) { 3760 return; 3761 } 3762 # endif 3763 KMP_ASSERT(fullMask != NULL); 3764 i = KMP_PLACE_ALL; 3765 mask = fullMask; 3766 } 3767 else { 3768 // 3769 // int i = some hash function or just a counter that doesn't 3770 // always start at 0. Use gtid for now. 3771 // 3772 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 3773 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 3774 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 3775 } 3776 } 3777 # endif 3778 3779 # if OMP_40_ENABLED 3780 th->th.th_current_place = i; 3781 if (isa_root) { 3782 th->th.th_new_place = i; 3783 th->th.th_first_place = 0; 3784 th->th.th_last_place = __kmp_affinity_num_masks - 1; 3785 } 3786 3787 if (i == KMP_PLACE_ALL) { 3788 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 3789 gtid)); 3790 } 3791 else { 3792 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 3793 gtid, i)); 3794 } 3795 # else 3796 if (i == -1) { 3797 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n", 3798 gtid)); 3799 } 3800 else { 3801 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 3802 gtid, i)); 3803 } 3804 # endif /* OMP_40_ENABLED */ 3805 3806 KMP_CPU_COPY(th->th.th_affin_mask, mask); 3807 3808 if (__kmp_affinity_verbose) { 3809 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3810 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3811 th->th.th_affin_mask); 3812 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid, 3813 buf); 3814 } 3815 3816 # if KMP_OS_WINDOWS 3817 // 3818 // On Windows* OS, the process affinity mask might have changed. 3819 // If the user didn't request affinity and this call fails, 3820 // just continue silently. See CQ171393. 3821 // 3822 if ( __kmp_affinity_type == affinity_none ) { 3823 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 3824 } 3825 else 3826 # endif 3827 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 3828 } 3829 3830 3831 # if OMP_40_ENABLED 3832 3833 void 3834 __kmp_affinity_set_place(int gtid) 3835 { 3836 int retval; 3837 3838 if (! 

# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */
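
//
// The __kmp_aux_*affinity*() routines below back the user-visible kmp_*
// affinity API (kmp_set_affinity(), kmp_get_affinity(), and the
// kmp_*_affinity_mask_proc() helpers named in the error messages).  A typical
// usage sequence from user code might look like the following sketch;
// kmp_create_affinity_mask() is the companion mask-allocation entry point of
// that API and is shown here purely for illustration:
//
//     kmp_affinity_mask_t mask;
//     kmp_create_affinity_mask(&mask);
//     kmp_set_affinity_mask_proc(3, &mask);    // add OS proc 3 to the mask
//     if (kmp_set_affinity(&mask) != 0) {
//         // mask rejected, e.g. empty or not contained in fullMask
//     }
//
// __kmp_aux_set_affinity() returns -1 if affinity is not capable, otherwise
// the return value of __kmp_set_system_affinity() (0 on success).
//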

int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
          gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}
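
//
// The three *_affinity_mask_proc() helpers below set, clear, or query a
// single OS proc id in a user-supplied mask.  The set and unset variants
// return 0 on success, -1 if proc lies outside [0, KMP_CPU_SETSIZE), and -2
// if proc is not contained in fullMask; the get variant simply reports 0 in
// both of those error cases.
//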

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
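
//
// For a uniform topology, __kmp_balanced_affinity() below spreads the
// nthreads threads of the team evenly over the cores: the first
// (nthreads % ncores) "big" cores receive (chunk + 1) threads each and the
// remaining cores receive chunk threads each.  A small worked example
// (illustrative values only): nthreads == 10 on 4 cores with 2 hardware
// threads per core gives chunk == 2, big_cores == 2 and big_nth == 6, so
// threads 0-2 land on core 0, threads 3-5 on core 1, threads 6-7 on core 2,
// and threads 8-9 on core 3.
//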

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / ( chunk + 1 );
            threadID = ( tid % ( chunk + 1 ) ) % __kmp_nth_per_core;
        } else { // tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine ||
          __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine ||
              __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count the osIDs found for the current core; there can be no more
                // than nth_per_core of them, and since address2os is sorted we can break
                // as soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first
                                // available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine ||
                                  __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with # procs from x to nth_per_core
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads bound to each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
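
            // The loop below distributes the nthreads threads over the
            // available hardware contexts listed in procarr.  On the first
            // sweep (flag == 0) every context receives at most one thread;
            // once all contexts hold a thread, later sweeps (flag == 1) keep
            // adding threads so that the per-context counts in newarr stay as
            // even as possible.  The prefix-sum scan that follows then picks
            // the context assigned to this tid.  Illustrative example: with
            // nthreads == 5 on 2 cores x 2 contexts, newarr ends up as
            // {2, 1, 1, 1}, so threads 0 and 1 share the first context and
            // threads 2-4 each get their own.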
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine ||
                      __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

#endif // KMP_AFFINITY_SUPPORTED