/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_affinity.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() {
    machine_hierarchy.fini();
}

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);
    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.skipPerLevel[depth-1])
        machine_hierarchy.resize(nproc);

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end-scan+1, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, end-scan+1, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example:  suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
    __kmp_free(counts);
    __kmp_free(lastLabel);
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

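//
// Returns the number of bits needed to represent "count" distinct values,
// i.e. the ceiling of log2(count).  For example, a count of 6 requires a
// 3-bit field.  This is used below to decode the package / core / thread
// bit fields of the legacy APIC id.
//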
static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (!((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
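        // For example (illustrative values only): with maxThreadsPerPkg = 16
        // and maxCoresPerPkg = 8, widthCT = 4, widthC = 3 and widthT = 1, so
        // an apicId of 0x1d decodes to pkgId = 1, coreId = 6, threadId = 1.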
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;             // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consistency checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
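        // For example, with a depth 3 (package/core/thread) map and
        // granularity=core, __kmp_affinity_gran_levels ends up as 1, i.e.
        // one bottom (thread) level is collapsed when granularity is applied.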
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We
    // will try to get more accurate values later by explicitly counting
    // them, but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    __kmp_free(retval);
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
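    // For each level of the sorted map: totals[level] counts the entities
    // seen at that level so far, counts[level] counts them within the
    // current parent, maxCt[level] records the largest per-parent count,
    // and last[level] holds the label most recently seen at that level.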
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes incorrect topology information to
                    // be printed in case the max value for some level
                    // (maxCt[level]) is encountered earlier than some smaller
                    // value while going through the array.  For example, let
                    // pkg0 have 4 cores and pkg1 have 2 cores.  Then
                    // maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);
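    // For example, with 2 packages x 4 cores x 2 threads, prod is 16, and the
    // topology is reported as uniform only if all 16 thread contexts were
    // actually observed at the finest level.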

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        int newPkgLevel = -1;
        int newCoreLevel = -1;
        int newThreadLevel = -1;
        int i;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1)
              && (level != pkgLevel)) {
                //
                // Remove this level.  Never remove the package level
                //
                continue;
            }
            if (level == pkgLevel) {
                newPkgLevel = level;
            }
            if (level == coreLevel) {
                newCoreLevel = level;
            }
            if (level == threadLevel) {
                newThreadLevel = level;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
        pkgLevel = newPkgLevel;
        coreLevel = newCoreLevel;
        threadLevel = newThreadLevel;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;


static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    unsigned i;
    const unsigned *aa = *((const unsigned **)a);
    const unsigned *bb = *((const unsigned **)b);
    for (i = maxIndex; ; i--) {
        if (aa[i] < bb[i]) return -1;
        if (aa[i] > bb[i]) return 1;
        if (i == osIdIndex) break;
    }
    return 0;
}


//
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
//
static int
__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
  kmp_i18n_id_t *const msg_id, FILE *f)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Scan the file once, counting the number of "processor" (osId) fields
    // and finding the highest value of <n> for a node_<n> field.
    //
    char buf[256];
    unsigned num_records = 0;
    while (! feof(f)) {
        buf[sizeof(buf) - 1] = 1;
        if (! fgets(buf, sizeof(buf), f)) {
            //
            // Read errors presumably because of EOF
            //
            break;
        }

        char s1[] = "processor";
        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
            num_records++;
            continue;
        }

        //
        // FIXME - this will match "node_<n> <garbage>"
        //
        unsigned level;
        if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
            if (nodeIdIndex + level >= maxIndex) {
                maxIndex = nodeIdIndex + level;
            }
            continue;
        }
    }

    //
    // Check for empty file / no valid processor records, or too many.
    // The number of records can't exceed the number of valid bits in the
    // affinity mask.
    //
    if (num_records == 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_NoProcRecords;
        return -1;
    }
    if (num_records > (unsigned)__kmp_xproc) {
        *line = 0;
        *msg_id = kmp_i18n_str_TooManyProcRecords;
        return -1;
    }

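    //
    // A record in /proc/cpuinfo typically looks like (illustrative excerpt):
    //
    //     processor       : 0
    //     physical id     : 0
    //     core id         : 0
    //
    // with records separated by blank lines; "processor", "physical id",
    // "core id", "thread id", and "node_<n> id" are the fields this parser
    // looks for.
    //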
    //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
    // Adding an extra element at the end allows us to remove a lot of extra
    // checks for termination conditions.
    //
    if (fseek(f, 0, SEEK_SET) != 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_CantRewindCpuinfo;
        return -1;
    }

    //
    // Allocate the array of records to store the proc info in.  The dummy
    // element at the end makes the logic in filling them out easier to code.
    //
    unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
      * sizeof(unsigned *));
    unsigned i;
    for (i = 0; i <= num_records; i++) {
        threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
          * sizeof(unsigned));
    }

#define CLEANUP_THREAD_INFO \
    for (i = 0; i <= num_records; i++) {            \
        __kmp_free(threadInfo[i]);                  \
    }                                               \
    __kmp_free(threadInfo);

    //
    // A value of UINT_MAX means that we didn't find the field
    //
    unsigned __index;

#define INIT_PROC_INFO(p) \
    for (__index = 0; __index <= maxIndex; __index++) {    \
        (p)[__index] = UINT_MAX;                           \
    }

    for (i = 0; i <= num_records; i++) {
        INIT_PROC_INFO(threadInfo[i]);
    }

    unsigned num_avail = 0;
    *line = 0;
    while (! feof(f)) {
        //
        // Create an inner scoping level, so that all the goto targets at the
        // end of the loop appear in an outer scoping level.  This avoids
        // warnings about jumping past an initialization to a target in the
        // same block.
        //
        {
            buf[sizeof(buf) - 1] = 1;
            bool long_line = false;
            if (! fgets(buf, sizeof(buf), f)) {
                //
                // Read errors presumably because of EOF
                //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
                //
                bool valid = false;
                for (i = 0; i <= maxIndex; i++) {
                    if (threadInfo[num_avail][i] != UINT_MAX) {
                        valid = true;
                    }
                }
                if (! valid) {
                    break;
                }
                buf[0] = 0;
            } else if (!buf[sizeof(buf) - 1]) {
                //
                // The line is longer than the buffer.  Set a flag and don't
                // emit an error if we were going to ignore the line, anyway.
                //
                long_line = true;

#define CHECK_LINE \
    if (long_line) {                                \
        CLEANUP_THREAD_INFO;                        \
        *msg_id = kmp_i18n_str_LongLineCpuinfo;     \
        return -1;                                  \
    }
            }
            (*line)++;

            char s1[] = "processor";
            if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s1) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][osIdIndex] = val;
#if KMP_OS_LINUX && USE_SYSFS_INFO
                char path[256];
                KMP_SNPRINTF(path, sizeof(path),
                    "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
                    threadInfo[num_avail][osIdIndex]);
                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);

                KMP_SNPRINTF(path, sizeof(path),
                    "/sys/devices/system/cpu/cpu%u/topology/core_id",
                    threadInfo[num_avail][osIdIndex]);
                __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
                continue;
#else
            }
            char s2[] = "physical id";
            if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s2) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][pkgIdIndex] = val;
                continue;
            }
            char s3[] = "core id";
            if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s3) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][coreIdIndex] = val;
                continue;
#endif // KMP_OS_LINUX && USE_SYSFS_INFO
            }
            char s4[] = "thread id";
            if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s4) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][threadIdIndex] = val;
                continue;
            }
            unsigned level;
            if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
                CHECK_LINE;
                char *p = strchr(buf + sizeof(s4) - 1, ':');
                unsigned val;
                if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val;
                KMP_ASSERT(nodeIdIndex + level <= maxIndex);
                if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field;
                threadInfo[num_avail][nodeIdIndex + level] = val;
                continue;
            }

            //
            // We didn't recognize the leading token on the line.
            // There are lots of leading tokens that we don't recognize -
            // if the line isn't empty, go on to the next line.
            //
            if ((*buf != 0) && (*buf != '\n')) {
                //
                // If the line is longer than the buffer, read characters
                // until we find a newline.
                //
                if (long_line) {
                    int ch;
                    while (((ch = fgetc(f)) != EOF) && (ch != '\n'));
                }
                continue;
            }

            //
            // A newline has signalled the end of the processor record.
            // Check that there aren't too many procs specified.
            //
            if ((int)num_avail == __kmp_xproc) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_TooManyEntries;
                return -1;
            }

            //
            // Check for missing fields.  The osId field must be there, and we
            // currently require that the physical id field is specified, also.
            //
            if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_MissingProcField;
                return -1;
            }
            if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
                CLEANUP_THREAD_INFO;
                *msg_id = kmp_i18n_str_MissingPhysicalIDField;
                return -1;
            }

            //
            // Skip this proc if it is not included in the machine model.
            //
            if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) {
                INIT_PROC_INFO(threadInfo[num_avail]);
                continue;
            }

            //
            // We have a successful parse of this proc's info.
            // Increment the counter, and prepare for the next proc.
            //
            num_avail++;
            KMP_ASSERT(num_avail <= num_records);
            INIT_PROC_INFO(threadInfo[num_avail]);
        }
        continue;

        no_val:
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_MissingValCpuinfo;
        return -1;

        dup_field:
        CLEANUP_THREAD_INFO;
        *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
        return -1;
    }
    *line = 0;

# if KMP_MIC && REDUCE_TEAM_SIZE
    unsigned teamSize = 0;
# endif // KMP_MIC && REDUCE_TEAM_SIZE

    // check for num_records == __kmp_xproc ???

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(num_avail > 0);
    KMP_ASSERT(num_avail <= num_records);
    if (num_avail == 1) {
        __kmp_ncores = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
        if (__kmp_affinity_verbose) {
KMP_AFFINITY_CAPABLE()) { 1849 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 1850 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1851 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1852 } 1853 else { 1854 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1855 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 1856 fullMask); 1857 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 1858 if (__kmp_affinity_respect_mask) { 1859 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1860 } else { 1861 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1862 } 1863 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1864 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1865 } 1866 int index; 1867 kmp_str_buf_t buf; 1868 __kmp_str_buf_init(&buf); 1869 __kmp_str_buf_print(&buf, "1"); 1870 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 1871 __kmp_str_buf_print(&buf, " x 1"); 1872 } 1873 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 1874 __kmp_str_buf_free(&buf); 1875 } 1876 1877 if (__kmp_affinity_type == affinity_none) { 1878 CLEANUP_THREAD_INFO; 1879 return 0; 1880 } 1881 1882 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1883 Address addr(1); 1884 addr.labels[0] = threadInfo[0][pkgIdIndex]; 1885 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 1886 1887 if (__kmp_affinity_gran_levels < 0) { 1888 __kmp_affinity_gran_levels = 0; 1889 } 1890 1891 if (__kmp_affinity_verbose) { 1892 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1893 } 1894 1895 CLEANUP_THREAD_INFO; 1896 return 1; 1897 } 1898 1899 // 1900 // Sort the threadInfo table by physical Id. 1901 // 1902 qsort(threadInfo, num_avail, sizeof(*threadInfo), 1903 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 1904 1905 // 1906 // The table is now sorted by pkgId / coreId / threadId, but we really 1907 // don't know the radix of any of the fields. pkgId's may be sparsely 1908 // assigned among the chips on a system. Although coreId's are usually 1909 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1910 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1911 // 1912 // For that matter, we don't know what coresPerPkg and threadsPerCore 1913 // (or the total # packages) are at this point - we want to determine 1914 // that now. We only have an upper bound on the first two figures. 1915 // 1916 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 1917 * sizeof(unsigned)); 1918 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 1919 * sizeof(unsigned)); 1920 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 1921 * sizeof(unsigned)); 1922 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 1923 * sizeof(unsigned)); 1924 1925 bool assign_thread_ids = false; 1926 unsigned threadIdCt; 1927 unsigned index; 1928 1929 restart_radix_check: 1930 threadIdCt = 0; 1931 1932 // 1933 // Initialize the counter arrays with data from threadInfo[0]. 1934 // 1935 if (assign_thread_ids) { 1936 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 1937 threadInfo[0][threadIdIndex] = threadIdCt++; 1938 } 1939 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 1940 threadIdCt = threadInfo[0][threadIdIndex] + 1; 1941 } 1942 } 1943 for (index = 0; index <= maxIndex; index++) { 1944 counts[index] = 1; 1945 maxCt[index] = 1; 1946 totals[index] = 1; 1947 lastId[index] = threadInfo[0][index];; 1948 } 1949 1950 // 1951 // Run through the rest of the OS procs. 
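// As a worked illustration (hypothetical box with 1 package, 2 cores and
// 2 HW threads per core), the sorted records {pkg,core,thr} =
// {0,0,0},{0,0,1},{0,1,0},{0,1,1} leave totals[pkgIdIndex] = 1,
// totals[coreIdIndex] = 2, totals[threadIdIndex] = 4 and
// maxCt[coreIdIndex] = maxCt[threadIdIndex] = 2, which is exactly what
// the nPackages / nCoresPerPkg / __kmp_nThreadsPerCore assignments
// further down rely on.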
1952 //
1953 for (i = 1; i < num_avail; i++) {
1954 //
1955 // Find the most significant index whose id differs
1956 // from the id for the previous OS proc.
1957 //
1958 for (index = maxIndex; index >= threadIdIndex; index--) {
1959 if (assign_thread_ids && (index == threadIdIndex)) {
1960 //
1961 // Auto-assign the thread id field if it wasn't specified.
1962 //
1963 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
1964 threadInfo[i][threadIdIndex] = threadIdCt++;
1965 }
1966
1967 //
1968 // Apparently the thread id field was specified for some
1969 // entries and not others. Start the thread id counter
1970 // off at the next higher thread id.
1971 //
1972 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
1973 threadIdCt = threadInfo[i][threadIdIndex] + 1;
1974 }
1975 }
1976 if (threadInfo[i][index] != lastId[index]) {
1977 //
1978 // Run through all indices which are less significant,
1979 // and reset the counts to 1.
1980 //
1981 // At all levels up to and including index, we need to
1982 // increment the totals and record the last id.
1983 //
1984 unsigned index2;
1985 for (index2 = threadIdIndex; index2 < index; index2++) {
1986 totals[index2]++;
1987 if (counts[index2] > maxCt[index2]) {
1988 maxCt[index2] = counts[index2];
1989 }
1990 counts[index2] = 1;
1991 lastId[index2] = threadInfo[i][index2];
1992 }
1993 counts[index]++;
1994 totals[index]++;
1995 lastId[index] = threadInfo[i][index];
1996
1997 if (assign_thread_ids && (index > threadIdIndex)) {
1998
1999 # if KMP_MIC && REDUCE_TEAM_SIZE
2000 //
2001 // The default team size is the total #threads in the machine
2002 // minus 1 thread for every core that has 3 or more threads.
2003 //
2004 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 );
2005 # endif // KMP_MIC && REDUCE_TEAM_SIZE
2006
2007 //
2008 // Restart the thread counter, as we are on a new core.
2009 //
2010 threadIdCt = 0;
2011
2012 //
2013 // Auto-assign the thread id field if it wasn't specified.
2014 //
2015 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2016 threadInfo[i][threadIdIndex] = threadIdCt++;
2017 }
2018
2019 //
2020 // Apparently the thread id field was specified for some
2021 // entries and not others. Start the thread id counter
2022 // off at the next higher thread id.
2023 //
2024 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2025 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2026 }
2027 }
2028 break;
2029 }
2030 }
2031 if (index < threadIdIndex) {
2032 //
2033 // If thread ids were specified, it is an error if they are not
2034 // unique. Also, check that we haven't already restarted the
2035 // loop (to be safe - shouldn't need to).
2036 //
2037 if ((threadInfo[i][threadIdIndex] != UINT_MAX)
2038 || assign_thread_ids) {
2039 __kmp_free(lastId);
2040 __kmp_free(totals);
2041 __kmp_free(maxCt);
2042 __kmp_free(counts);
2043 CLEANUP_THREAD_INFO;
2044 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2045 return -1;
2046 }
2047
2048 //
2049 // If the thread ids were not specified and we see entries
2050 // that are duplicates, start the loop over and
2051 // assign the thread ids manually.
2052 //
2053 assign_thread_ids = true;
2054 goto restart_radix_check;
2055 }
2056 }
2057
2058 # if KMP_MIC && REDUCE_TEAM_SIZE
2059 //
2060 // The default team size is the total #threads in the machine
2061 // minus 1 thread for every core that has 3 or more threads.
2062 //
2063 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2064 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2065 2066 for (index = threadIdIndex; index <= maxIndex; index++) { 2067 if (counts[index] > maxCt[index]) { 2068 maxCt[index] = counts[index]; 2069 } 2070 } 2071 2072 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2073 nCoresPerPkg = maxCt[coreIdIndex]; 2074 nPackages = totals[pkgIdIndex]; 2075 2076 // 2077 // Check to see if the machine topology is uniform 2078 // 2079 unsigned prod = totals[maxIndex]; 2080 for (index = threadIdIndex; index < maxIndex; index++) { 2081 prod *= maxCt[index]; 2082 } 2083 bool uniform = (prod == totals[threadIdIndex]); 2084 2085 // 2086 // When affinity is off, this routine will still be called to set 2087 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2088 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2089 // correctly, and return now if affinity is not enabled. 2090 // 2091 __kmp_ncores = totals[coreIdIndex]; 2092 2093 if (__kmp_affinity_verbose) { 2094 if (! KMP_AFFINITY_CAPABLE()) { 2095 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2096 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2097 if (uniform) { 2098 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2099 } else { 2100 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2101 } 2102 } 2103 else { 2104 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2105 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2106 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2107 if (__kmp_affinity_respect_mask) { 2108 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2109 } else { 2110 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2111 } 2112 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2113 if (uniform) { 2114 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2115 } else { 2116 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2117 } 2118 } 2119 kmp_str_buf_t buf; 2120 __kmp_str_buf_init(&buf); 2121 2122 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2123 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2124 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2125 } 2126 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2127 maxCt[threadIdIndex], __kmp_ncores); 2128 2129 __kmp_str_buf_free(&buf); 2130 } 2131 2132 # if KMP_MIC && REDUCE_TEAM_SIZE 2133 // 2134 // Set the default team size. 2135 // 2136 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2137 __kmp_dflt_team_nth = teamSize; 2138 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2139 __kmp_dflt_team_nth)); 2140 } 2141 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2142 2143 if (__kmp_affinity_type == affinity_none) { 2144 __kmp_free(lastId); 2145 __kmp_free(totals); 2146 __kmp_free(maxCt); 2147 __kmp_free(counts); 2148 CLEANUP_THREAD_INFO; 2149 return 0; 2150 } 2151 2152 // 2153 // Count the number of levels which have more nodes at that level than 2154 // at the parent's level (with there being an implicit root node of 2155 // the top level). This is equivalent to saying that there is at least 2156 // one node at this level which has a sibling. These levels are in the 2157 // map, and the package level is always in the map. 
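// Continuing the 1-package x 2-core x 2-thread illustration above,
// totals[] is 4 / 2 / 1 at the thread / core / package indices, so
// inMap[] comes out true for the thread and core levels (each has a
// sibling) and is forced true for the package level, giving depth 3.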
2158 // 2159 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2160 int level = 0; 2161 for (index = threadIdIndex; index < maxIndex; index++) { 2162 KMP_ASSERT(totals[index] >= totals[index + 1]); 2163 inMap[index] = (totals[index] > totals[index + 1]); 2164 } 2165 inMap[maxIndex] = (totals[maxIndex] > 1); 2166 inMap[pkgIdIndex] = true; 2167 2168 int depth = 0; 2169 for (index = threadIdIndex; index <= maxIndex; index++) { 2170 if (inMap[index]) { 2171 depth++; 2172 } 2173 } 2174 KMP_ASSERT(depth > 0); 2175 2176 // 2177 // Construct the data structure that is to be returned. 2178 // 2179 *address2os = (AddrUnsPair*) 2180 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2181 int pkgLevel = -1; 2182 int coreLevel = -1; 2183 int threadLevel = -1; 2184 2185 for (i = 0; i < num_avail; ++i) { 2186 Address addr(depth); 2187 unsigned os = threadInfo[i][osIdIndex]; 2188 int src_index; 2189 int dst_index = 0; 2190 2191 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2192 if (! inMap[src_index]) { 2193 continue; 2194 } 2195 addr.labels[dst_index] = threadInfo[i][src_index]; 2196 if (src_index == pkgIdIndex) { 2197 pkgLevel = dst_index; 2198 } 2199 else if (src_index == coreIdIndex) { 2200 coreLevel = dst_index; 2201 } 2202 else if (src_index == threadIdIndex) { 2203 threadLevel = dst_index; 2204 } 2205 dst_index++; 2206 } 2207 (*address2os)[i] = AddrUnsPair(addr, os); 2208 } 2209 2210 if (__kmp_affinity_gran_levels < 0) { 2211 // 2212 // Set the granularity level based on what levels are modeled 2213 // in the machine topology map. 2214 // 2215 unsigned src_index; 2216 __kmp_affinity_gran_levels = 0; 2217 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2218 if (! inMap[src_index]) { 2219 continue; 2220 } 2221 switch (src_index) { 2222 case threadIdIndex: 2223 if (__kmp_affinity_gran > affinity_gran_thread) { 2224 __kmp_affinity_gran_levels++; 2225 } 2226 2227 break; 2228 case coreIdIndex: 2229 if (__kmp_affinity_gran > affinity_gran_core) { 2230 __kmp_affinity_gran_levels++; 2231 } 2232 break; 2233 2234 case pkgIdIndex: 2235 if (__kmp_affinity_gran > affinity_gran_package) { 2236 __kmp_affinity_gran_levels++; 2237 } 2238 break; 2239 } 2240 } 2241 } 2242 2243 if (__kmp_affinity_verbose) { 2244 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2245 coreLevel, threadLevel); 2246 } 2247 2248 __kmp_free(inMap); 2249 __kmp_free(lastId); 2250 __kmp_free(totals); 2251 __kmp_free(maxCt); 2252 __kmp_free(counts); 2253 CLEANUP_THREAD_INFO; 2254 return depth; 2255 } 2256 2257 2258 // 2259 // Create and return a table of affinity masks, indexed by OS thread ID. 2260 // This routine handles OR'ing together all the affinity masks of threads 2261 // that are sufficiently close, if granularity > fine. 2262 // 2263 static kmp_affin_mask_t * 2264 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2265 AddrUnsPair *address2os, unsigned numAddrs) 2266 { 2267 // 2268 // First form a table of affinity masks in order of OS thread id. 2269 // 2270 unsigned depth; 2271 unsigned maxOsId; 2272 unsigned i; 2273 2274 KMP_ASSERT(numAddrs > 0); 2275 depth = address2os[0].first.depth; 2276 2277 maxOsId = 0; 2278 for (i = 0; i < numAddrs; i++) { 2279 unsigned osId = address2os[i].second; 2280 if (osId > maxOsId) { 2281 maxOsId = osId; 2282 } 2283 } 2284 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2285 (maxOsId + 1) * __kmp_affin_mask_size); 2286 2287 // 2288 // Sort the address2os table according to physical order. 
Doing so 2289 // will put all threads on the same core/package/node in consecutive 2290 // locations. 2291 // 2292 qsort(address2os, numAddrs, sizeof(*address2os), 2293 __kmp_affinity_cmp_Address_labels); 2294 2295 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2296 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2297 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2298 } 2299 if (__kmp_affinity_gran_levels >= (int)depth) { 2300 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2301 && (__kmp_affinity_type != affinity_none))) { 2302 KMP_WARNING(AffThreadsMayMigrate); 2303 } 2304 } 2305 2306 // 2307 // Run through the table, forming the masks for all threads on each 2308 // core. Threads on the same core will have identical "Address" 2309 // objects, not considering the last level, which must be the thread 2310 // id. All threads on a core will appear consecutively. 2311 // 2312 unsigned unique = 0; 2313 unsigned j = 0; // index of 1st thread on core 2314 unsigned leader = 0; 2315 Address *leaderAddr = &(address2os[0].first); 2316 kmp_affin_mask_t *sum 2317 = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size); 2318 KMP_CPU_ZERO(sum); 2319 KMP_CPU_SET(address2os[0].second, sum); 2320 for (i = 1; i < numAddrs; i++) { 2321 // 2322 // If this thread is sufficiently close to the leader (within the 2323 // granularity setting), then set the bit for this os thread in the 2324 // affinity mask for this group, and go on to the next thread. 2325 // 2326 if (leaderAddr->isClose(address2os[i].first, 2327 __kmp_affinity_gran_levels)) { 2328 KMP_CPU_SET(address2os[i].second, sum); 2329 continue; 2330 } 2331 2332 // 2333 // For every thread in this group, copy the mask to the thread's 2334 // entry in the osId2Mask table. Mark the first address as a 2335 // leader. 2336 // 2337 for (; j < i; j++) { 2338 unsigned osId = address2os[j].second; 2339 KMP_DEBUG_ASSERT(osId <= maxOsId); 2340 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2341 KMP_CPU_COPY(mask, sum); 2342 address2os[j].first.leader = (j == leader); 2343 } 2344 unique++; 2345 2346 // 2347 // Start a new mask. 2348 // 2349 leader = i; 2350 leaderAddr = &(address2os[i].first); 2351 KMP_CPU_ZERO(sum); 2352 KMP_CPU_SET(address2os[i].second, sum); 2353 } 2354 2355 // 2356 // For every thread in last group, copy the mask to the thread's 2357 // entry in the osId2Mask table. 2358 // 2359 for (; j < i; j++) { 2360 unsigned osId = address2os[j].second; 2361 KMP_DEBUG_ASSERT(osId <= maxOsId); 2362 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2363 KMP_CPU_COPY(mask, sum); 2364 address2os[j].first.leader = (j == leader); 2365 } 2366 unique++; 2367 2368 *maxIndex = maxOsId; 2369 *numUnique = unique; 2370 return osId2Mask; 2371 } 2372 2373 2374 // 2375 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2376 // as file-static than to try and pass them through the calling sequence of 2377 // the recursive-descent OMP_PLACES parser. 
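// For example (values are only illustrative), a proclist such as "3,0-2"
// produces one mask per OS proc ({3},{0},{1},{2}), whereas a braced set
// such as "{0,4}" is OR'd into a single mask before being appended.
// ADD_MASK() below grows the newMasks vector by doubling numNewMasks via
// KMP_INTERNAL_REALLOC whenever nextNewMask catches up with it.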
2378 // 2379 static kmp_affin_mask_t *newMasks; 2380 static int numNewMasks; 2381 static int nextNewMask; 2382 2383 #define ADD_MASK(_mask) \ 2384 { \ 2385 if (nextNewMask >= numNewMasks) { \ 2386 numNewMasks *= 2; \ 2387 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2388 numNewMasks * __kmp_affin_mask_size); \ 2389 } \ 2390 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2391 nextNewMask++; \ 2392 } 2393 2394 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2395 { \ 2396 if (((_osId) > _maxOsId) || \ 2397 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2398 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2399 && (__kmp_affinity_type != affinity_none))) { \ 2400 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2401 } \ 2402 } \ 2403 else { \ 2404 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2405 } \ 2406 } 2407 2408 2409 // 2410 // Re-parse the proclist (for the explicit affinity type), and form the list 2411 // of affinity newMasks indexed by gtid. 2412 // 2413 static void 2414 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2415 unsigned int *out_numMasks, const char *proclist, 2416 kmp_affin_mask_t *osId2Mask, int maxOsId) 2417 { 2418 const char *scan = proclist; 2419 const char *next = proclist; 2420 2421 // 2422 // We use malloc() for the temporary mask vector, 2423 // so that we can use realloc() to extend it. 2424 // 2425 numNewMasks = 2; 2426 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2427 * __kmp_affin_mask_size); 2428 nextNewMask = 0; 2429 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2430 __kmp_affin_mask_size); 2431 int setSize = 0; 2432 2433 for (;;) { 2434 int start, end, stride; 2435 2436 SKIP_WS(scan); 2437 next = scan; 2438 if (*next == '\0') { 2439 break; 2440 } 2441 2442 if (*next == '{') { 2443 int num; 2444 setSize = 0; 2445 next++; // skip '{' 2446 SKIP_WS(next); 2447 scan = next; 2448 2449 // 2450 // Read the first integer in the set. 2451 // 2452 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2453 "bad proclist"); 2454 SKIP_DIGITS(next); 2455 num = __kmp_str_to_int(scan, *next); 2456 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2457 2458 // 2459 // Copy the mask for that osId to the sum (union) mask. 2460 // 2461 if ((num > maxOsId) || 2462 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2463 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2464 && (__kmp_affinity_type != affinity_none))) { 2465 KMP_WARNING(AffIgnoreInvalidProcID, num); 2466 } 2467 KMP_CPU_ZERO(sumMask); 2468 } 2469 else { 2470 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2471 setSize = 1; 2472 } 2473 2474 for (;;) { 2475 // 2476 // Check for end of set. 2477 // 2478 SKIP_WS(next); 2479 if (*next == '}') { 2480 next++; // skip '}' 2481 break; 2482 } 2483 2484 // 2485 // Skip optional comma. 2486 // 2487 if (*next == ',') { 2488 next++; 2489 } 2490 SKIP_WS(next); 2491 2492 // 2493 // Read the next integer in the set. 2494 // 2495 scan = next; 2496 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2497 "bad explicit proc list"); 2498 2499 SKIP_DIGITS(next); 2500 num = __kmp_str_to_int(scan, *next); 2501 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2502 2503 // 2504 // Add the mask for that osId to the sum mask. 2505 // 2506 if ((num > maxOsId) || 2507 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2508 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2509 && (__kmp_affinity_type != affinity_none))) { 2510 KMP_WARNING(AffIgnoreInvalidProcID, num); 2511 } 2512 } 2513 else { 2514 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2515 setSize++; 2516 } 2517 } 2518 if (setSize > 0) { 2519 ADD_MASK(sumMask); 2520 } 2521 2522 SKIP_WS(next); 2523 if (*next == ',') { 2524 next++; 2525 } 2526 scan = next; 2527 continue; 2528 } 2529 2530 // 2531 // Read the first integer. 2532 // 2533 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2534 SKIP_DIGITS(next); 2535 start = __kmp_str_to_int(scan, *next); 2536 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2537 SKIP_WS(next); 2538 2539 // 2540 // If this isn't a range, then add a mask to the list and go on. 2541 // 2542 if (*next != '-') { 2543 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2544 2545 // 2546 // Skip optional comma. 2547 // 2548 if (*next == ',') { 2549 next++; 2550 } 2551 scan = next; 2552 continue; 2553 } 2554 2555 // 2556 // This is a range. Skip over the '-' and read in the 2nd int. 2557 // 2558 next++; // skip '-' 2559 SKIP_WS(next); 2560 scan = next; 2561 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2562 SKIP_DIGITS(next); 2563 end = __kmp_str_to_int(scan, *next); 2564 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2565 2566 // 2567 // Check for a stride parameter 2568 // 2569 stride = 1; 2570 SKIP_WS(next); 2571 if (*next == ':') { 2572 // 2573 // A stride is specified. Skip over the ':" and read the 3rd int. 2574 // 2575 int sign = +1; 2576 next++; // skip ':' 2577 SKIP_WS(next); 2578 scan = next; 2579 if (*next == '-') { 2580 sign = -1; 2581 next++; 2582 SKIP_WS(next); 2583 scan = next; 2584 } 2585 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2586 "bad explicit proc list"); 2587 SKIP_DIGITS(next); 2588 stride = __kmp_str_to_int(scan, *next); 2589 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2590 stride *= sign; 2591 } 2592 2593 // 2594 // Do some range checks. 2595 // 2596 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2597 if (stride > 0) { 2598 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2599 } 2600 else { 2601 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2602 } 2603 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2604 2605 // 2606 // Add the mask for each OS proc # to the list. 2607 // 2608 if (stride > 0) { 2609 do { 2610 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2611 start += stride; 2612 } while (start <= end); 2613 } 2614 else { 2615 do { 2616 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2617 start += stride; 2618 } while (start >= end); 2619 } 2620 2621 // 2622 // Skip optional comma. 2623 // 2624 SKIP_WS(next); 2625 if (*next == ',') { 2626 next++; 2627 } 2628 scan = next; 2629 } 2630 2631 *out_numMasks = nextNewMask; 2632 if (nextNewMask == 0) { 2633 *out_masks = NULL; 2634 KMP_INTERNAL_FREE(newMasks); 2635 return; 2636 } 2637 *out_masks 2638 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2639 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2640 __kmp_free(sumMask); 2641 KMP_INTERNAL_FREE(newMasks); 2642 } 2643 2644 2645 # if OMP_40_ENABLED 2646 2647 /*----------------------------------------------------------------------------- 2648 2649 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2650 places. 
Again, here is the grammar:
2651
2652 place_list := place
2653 place_list := place , place_list
2654 place := num
2655 place := place : num
2656 place := place : num : signed
2657 place := { subplace_list }
2658 place := ! place // (lowest priority)
2659 subplace_list := subplace
2660 subplace_list := subplace , subplace_list
2661 subplace := num
2662 subplace := num : num
2663 subplace := num : num : signed
2664 signed := num
2665 signed := + signed
2666 signed := - signed
2667
2668 -----------------------------------------------------------------------------*/
2669
2670 static void
2671 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask,
2672 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2673 {
2674 const char *next;
2675
2676 for (;;) {
2677 int start, count, stride, i;
2678
2679 //
2680 // Read in the starting proc id
2681 //
2682 SKIP_WS(*scan);
2683 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2684 "bad explicit places list");
2685 next = *scan;
2686 SKIP_DIGITS(next);
2687 start = __kmp_str_to_int(*scan, *next);
2688 KMP_ASSERT(start >= 0);
2689 *scan = next;
2690
2691 //
2692 // valid follow sets are ',' ':' and '}'
2693 //
2694 SKIP_WS(*scan);
2695 if (**scan == '}' || **scan == ',') {
2696 if ((start > maxOsId) ||
2697 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2698 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2699 && (__kmp_affinity_type != affinity_none))) {
2700 KMP_WARNING(AffIgnoreInvalidProcID, start);
2701 }
2702 }
2703 else {
2704 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2705 (*setSize)++;
2706 }
2707 if (**scan == '}') {
2708 break;
2709 }
2710 (*scan)++; // skip ','
2711 continue;
2712 }
2713 KMP_ASSERT2(**scan == ':', "bad explicit places list");
2714 (*scan)++; // skip ':'
2715
2716 //
2717 // Read count parameter
2718 //
2719 SKIP_WS(*scan);
2720 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2721 "bad explicit places list");
2722 next = *scan;
2723 SKIP_DIGITS(next);
2724 count = __kmp_str_to_int(*scan, *next);
2725 KMP_ASSERT(count >= 0);
2726 *scan = next;
2727
2728 //
2729 // valid follow sets are ',' ':' and '}'
2730 //
2731 SKIP_WS(*scan);
2732 if (**scan == '}' || **scan == ',') {
2733 for (i = 0; i < count; i++) {
2734 if ((start > maxOsId) ||
2735 (!
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2736 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2737 && (__kmp_affinity_type != affinity_none))) {
2738 KMP_WARNING(AffIgnoreInvalidProcID, start);
2739 }
2740 break; // don't proliferate warnings for large count
2741 }
2742 else {
2743 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2744 start++;
2745 (*setSize)++;
2746 }
2747 }
2748 if (**scan == '}') {
2749 break;
2750 }
2751 (*scan)++; // skip ','
2752 continue;
2753 }
2754 KMP_ASSERT2(**scan == ':', "bad explicit places list");
2755 (*scan)++; // skip ':'
2756
2757 //
2758 // Read stride parameter
2759 //
2760 int sign = +1;
2761 for (;;) {
2762 SKIP_WS(*scan);
2763 if (**scan == '+') {
2764 (*scan)++; // skip '+'
2765 continue;
2766 }
2767 if (**scan == '-') {
2768 sign *= -1;
2769 (*scan)++; // skip '-'
2770 continue;
2771 }
2772 break;
2773 }
2774 SKIP_WS(*scan);
2775 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'),
2776 "bad explicit places list");
2777 next = *scan;
2778 SKIP_DIGITS(next);
2779 stride = __kmp_str_to_int(*scan, *next);
2780 KMP_ASSERT(stride >= 0);
2781 *scan = next;
2782 stride *= sign;
2783
2784 //
2785 // valid follow sets are ',' and '}'
2786 //
2787 SKIP_WS(*scan);
2788 if (**scan == '}' || **scan == ',') {
2789 for (i = 0; i < count; i++) {
2790 if ((start > maxOsId) ||
2791 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
2792 if (__kmp_affinity_verbose || (__kmp_affinity_warnings
2793 && (__kmp_affinity_type != affinity_none))) {
2794 KMP_WARNING(AffIgnoreInvalidProcID, start);
2795 }
2796 break; // don't proliferate warnings for large count
2797 }
2798 else {
2799 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
2800 start += stride;
2801 (*setSize)++;
2802 }
2803 }
2804 if (**scan == '}') {
2805 break;
2806 }
2807 (*scan)++; // skip ','
2808 continue;
2809 }
2810
2811 KMP_ASSERT2(0, "bad explicit places list");
2812 }
2813 }
2814
2815
2816 static void
2817 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
2818 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize)
2819 {
2820 const char *next;
2821
2822 //
2823 // valid follow sets are '{' '!' and num
2824 //
2825 SKIP_WS(*scan);
2826 if (**scan == '{') {
2827 (*scan)++; // skip '{'
2828 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask,
2829 setSize);
2830 KMP_ASSERT2(**scan == '}', "bad explicit places list");
2831 (*scan)++; // skip '}'
2832 }
2833 else if (**scan == '!') {
2834 (*scan)++; // skip '!' before recursing, or the same '!' is seen again
2835 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
2836 KMP_CPU_COMPLEMENT(tempMask);
2837 }
2838 else if ((**scan >= '0') && (**scan <= '9')) {
2839 next = *scan;
2840 SKIP_DIGITS(next);
2841 int num = __kmp_str_to_int(*scan, *next);
2842 KMP_ASSERT(num >= 0);
2843 if ((num > maxOsId) ||
2844 (!
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2845 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2846 && (__kmp_affinity_type != affinity_none))) { 2847 KMP_WARNING(AffIgnoreInvalidProcID, num); 2848 } 2849 } 2850 else { 2851 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 2852 (*setSize)++; 2853 } 2854 *scan = next; // skip num 2855 } 2856 else { 2857 KMP_ASSERT2(0, "bad explicit places list"); 2858 } 2859 } 2860 2861 2862 //static void 2863 void 2864 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 2865 unsigned int *out_numMasks, const char *placelist, 2866 kmp_affin_mask_t *osId2Mask, int maxOsId) 2867 { 2868 const char *scan = placelist; 2869 const char *next = placelist; 2870 2871 numNewMasks = 2; 2872 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2873 * __kmp_affin_mask_size); 2874 nextNewMask = 0; 2875 2876 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 2877 __kmp_affin_mask_size); 2878 KMP_CPU_ZERO(tempMask); 2879 int setSize = 0; 2880 2881 for (;;) { 2882 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 2883 2884 // 2885 // valid follow sets are ',' ':' and EOL 2886 // 2887 SKIP_WS(scan); 2888 if (*scan == '\0' || *scan == ',') { 2889 if (setSize > 0) { 2890 ADD_MASK(tempMask); 2891 } 2892 KMP_CPU_ZERO(tempMask); 2893 setSize = 0; 2894 if (*scan == '\0') { 2895 break; 2896 } 2897 scan++; // skip ',' 2898 continue; 2899 } 2900 2901 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2902 scan++; // skip ':' 2903 2904 // 2905 // Read count parameter 2906 // 2907 SKIP_WS(scan); 2908 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 2909 "bad explicit places list"); 2910 next = scan; 2911 SKIP_DIGITS(next); 2912 int count = __kmp_str_to_int(scan, *next); 2913 KMP_ASSERT(count >= 0); 2914 scan = next; 2915 2916 // 2917 // valid follow sets are ',' ':' and EOL 2918 // 2919 SKIP_WS(scan); 2920 int stride; 2921 if (*scan == '\0' || *scan == ',') { 2922 stride = +1; 2923 } 2924 else { 2925 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2926 scan++; // skip ':' 2927 2928 // 2929 // Read stride parameter 2930 // 2931 int sign = +1; 2932 for (;;) { 2933 SKIP_WS(scan); 2934 if (*scan == '+') { 2935 scan++; // skip '+' 2936 continue; 2937 } 2938 if (*scan == '-') { 2939 sign *= -1; 2940 scan++; // skip '-' 2941 continue; 2942 } 2943 break; 2944 } 2945 SKIP_WS(scan); 2946 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 2947 "bad explicit places list"); 2948 next = scan; 2949 SKIP_DIGITS(next); 2950 stride = __kmp_str_to_int(scan, *next); 2951 KMP_DEBUG_ASSERT(stride >= 0); 2952 scan = next; 2953 stride *= sign; 2954 } 2955 2956 if (stride > 0) { 2957 int i; 2958 for (i = 0; i < count; i++) { 2959 int j; 2960 if (setSize == 0) { 2961 break; 2962 } 2963 ADD_MASK(tempMask); 2964 setSize = 0; 2965 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 2966 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 2967 KMP_CPU_CLR(j, tempMask); 2968 } 2969 else if ((j > maxOsId) || 2970 (! 
KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 2971 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 2972 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 2973 KMP_WARNING(AffIgnoreInvalidProcID, j); 2974 } 2975 KMP_CPU_CLR(j, tempMask); 2976 } 2977 else { 2978 KMP_CPU_SET(j, tempMask); 2979 setSize++; 2980 } 2981 } 2982 for (; j >= 0; j--) { 2983 KMP_CPU_CLR(j, tempMask); 2984 } 2985 } 2986 } 2987 else { 2988 int i; 2989 for (i = 0; i < count; i++) { 2990 int j; 2991 if (setSize == 0) { 2992 break; 2993 } 2994 ADD_MASK(tempMask); 2995 setSize = 0; 2996 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 2997 j++) { 2998 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 2999 KMP_CPU_CLR(j, tempMask); 3000 } 3001 else if ((j > maxOsId) || 3002 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3003 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3004 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3005 KMP_WARNING(AffIgnoreInvalidProcID, j); 3006 } 3007 KMP_CPU_CLR(j, tempMask); 3008 } 3009 else { 3010 KMP_CPU_SET(j, tempMask); 3011 setSize++; 3012 } 3013 } 3014 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3015 KMP_CPU_CLR(j, tempMask); 3016 } 3017 } 3018 } 3019 KMP_CPU_ZERO(tempMask); 3020 setSize = 0; 3021 3022 // 3023 // valid follow sets are ',' and EOL 3024 // 3025 SKIP_WS(scan); 3026 if (*scan == '\0') { 3027 break; 3028 } 3029 if (*scan == ',') { 3030 scan++; // skip ',' 3031 continue; 3032 } 3033 3034 KMP_ASSERT2(0, "bad explicit places list"); 3035 } 3036 3037 *out_numMasks = nextNewMask; 3038 if (nextNewMask == 0) { 3039 *out_masks = NULL; 3040 KMP_INTERNAL_FREE(newMasks); 3041 return; 3042 } 3043 *out_masks 3044 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3045 KMP_MEMCPY(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3046 __kmp_free(tempMask); 3047 KMP_INTERNAL_FREE(newMasks); 3048 } 3049 3050 # endif /* OMP_40_ENABLED */ 3051 3052 #undef ADD_MASK 3053 #undef ADD_MASK_OSID 3054 3055 static void 3056 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3057 { 3058 if ( __kmp_place_num_cores == 0 ) { 3059 if ( __kmp_place_num_threads_per_core == 0 ) { 3060 return; // no cores limiting actions requested, exit 3061 } 3062 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3063 } 3064 if ( !__kmp_affinity_uniform_topology() ) { 3065 KMP_WARNING( AffThrPlaceNonUniform ); 3066 return; // don't support non-uniform topology 3067 } 3068 if ( depth != 3 ) { 3069 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3070 return; // don't support not-3-level topology 3071 } 3072 if ( __kmp_place_num_threads_per_core == 0 ) { 3073 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3074 } 3075 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3076 KMP_WARNING( AffThrPlaceManyCores ); 3077 return; 3078 } 3079 3080 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3081 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3082 int i, j, k, n_old = 0, n_new = 0; 3083 for ( i = 0; i < nPackages; ++i ) { 3084 for ( j = 0; j < nCoresPerPkg; ++j ) { 3085 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3086 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3087 } else { 3088 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3089 if ( k < __kmp_place_num_threads_per_core ) { 3090 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' 
data to new location 3091 n_new++; 3092 } 3093 n_old++; 3094 } 3095 } 3096 } 3097 } 3098 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3099 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3100 __kmp_avail_proc = n_new; // correct avail_proc 3101 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3102 3103 __kmp_free( *pAddr ); 3104 *pAddr = newAddr; // replace old topology with new one 3105 } 3106 3107 3108 static AddrUnsPair *address2os = NULL; 3109 static int * procarr = NULL; 3110 static int __kmp_aff_depth = 0; 3111 3112 static void 3113 __kmp_aux_affinity_initialize(void) 3114 { 3115 if (__kmp_affinity_masks != NULL) { 3116 KMP_ASSERT(fullMask != NULL); 3117 return; 3118 } 3119 3120 // 3121 // Create the "full" mask - this defines all of the processors that we 3122 // consider to be in the machine model. If respect is set, then it is 3123 // the initialization thread's affinity mask. Otherwise, it is all 3124 // processors that we know about on the machine. 3125 // 3126 if (fullMask == NULL) { 3127 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3128 } 3129 if (KMP_AFFINITY_CAPABLE()) { 3130 if (__kmp_affinity_respect_mask) { 3131 __kmp_get_system_affinity(fullMask, TRUE); 3132 3133 // 3134 // Count the number of available processors. 3135 // 3136 unsigned i; 3137 __kmp_avail_proc = 0; 3138 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3139 if (! KMP_CPU_ISSET(i, fullMask)) { 3140 continue; 3141 } 3142 __kmp_avail_proc++; 3143 } 3144 if (__kmp_avail_proc > __kmp_xproc) { 3145 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3146 && (__kmp_affinity_type != affinity_none))) { 3147 KMP_WARNING(ErrorInitializeAffinity); 3148 } 3149 __kmp_affinity_type = affinity_none; 3150 KMP_AFFINITY_DISABLE(); 3151 return; 3152 } 3153 } 3154 else { 3155 __kmp_affinity_entire_machine_mask(fullMask); 3156 __kmp_avail_proc = __kmp_xproc; 3157 } 3158 } 3159 3160 int depth = -1; 3161 kmp_i18n_id_t msg_id = kmp_i18n_null; 3162 3163 // 3164 // For backward compatibility, setting KMP_CPUINFO_FILE => 3165 // KMP_TOPOLOGY_METHOD=cpuinfo 3166 // 3167 if ((__kmp_cpuinfo_file != NULL) && 3168 (__kmp_affinity_top_method == affinity_top_method_all)) { 3169 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3170 } 3171 3172 if (__kmp_affinity_top_method == affinity_top_method_all) { 3173 // 3174 // In the default code path, errors are not fatal - we just try using 3175 // another method. We only emit a warning message if affinity is on, 3176 // or the verbose flag is set, an the nowarnings flag was not set. 
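// The fallback order attempted below is: x2APIC ids, then legacy APIC
// ids (x86 only), then parsing /proc/cpuinfo (Linux only), then Windows
// processor groups (when there is more than one), and finally the flat
// OS-proc map.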
3177 // 3178 const char *file_name = NULL; 3179 int line = 0; 3180 3181 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3182 3183 if (__kmp_affinity_verbose) { 3184 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3185 } 3186 3187 file_name = NULL; 3188 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3189 if (depth == 0) { 3190 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3191 KMP_ASSERT(address2os == NULL); 3192 return; 3193 } 3194 3195 if (depth < 0) { 3196 if (__kmp_affinity_verbose) { 3197 if (msg_id != kmp_i18n_null) { 3198 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3199 KMP_I18N_STR(DecodingLegacyAPIC)); 3200 } 3201 else { 3202 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3203 } 3204 } 3205 3206 file_name = NULL; 3207 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3208 if (depth == 0) { 3209 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3210 KMP_ASSERT(address2os == NULL); 3211 return; 3212 } 3213 } 3214 3215 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3216 3217 # if KMP_OS_LINUX 3218 3219 if (depth < 0) { 3220 if (__kmp_affinity_verbose) { 3221 if (msg_id != kmp_i18n_null) { 3222 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3223 } 3224 else { 3225 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3226 } 3227 } 3228 3229 FILE *f = fopen("/proc/cpuinfo", "r"); 3230 if (f == NULL) { 3231 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3232 } 3233 else { 3234 file_name = "/proc/cpuinfo"; 3235 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3236 fclose(f); 3237 if (depth == 0) { 3238 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3239 KMP_ASSERT(address2os == NULL); 3240 return; 3241 } 3242 } 3243 } 3244 3245 # endif /* KMP_OS_LINUX */ 3246 3247 # if KMP_GROUP_AFFINITY 3248 3249 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3250 if (__kmp_affinity_verbose) { 3251 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3252 } 3253 3254 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3255 KMP_ASSERT(depth != 0); 3256 } 3257 3258 # endif /* KMP_GROUP_AFFINITY */ 3259 3260 if (depth < 0) { 3261 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3262 if (file_name == NULL) { 3263 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3264 } 3265 else if (line == 0) { 3266 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3267 } 3268 else { 3269 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3270 } 3271 } 3272 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3273 3274 file_name = ""; 3275 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3276 if (depth == 0) { 3277 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3278 KMP_ASSERT(address2os == NULL); 3279 return; 3280 } 3281 KMP_ASSERT(depth > 0); 3282 KMP_ASSERT(address2os != NULL); 3283 } 3284 } 3285 3286 // 3287 // If the user has specified that a paricular topology discovery method 3288 // is to be used, then we abort if that method fails. The exception is 3289 // group affinity, which might have been implicitly set. 
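// (Each __kmp_affinity_top_method value other than affinity_top_method_all
// selects exactly one of the "else if" branches below, and a failure
// (depth < 0) of the chosen method is reported via KMP_FATAL instead of
// silently falling back to another method.)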
3290 // 3291 3292 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3293 3294 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3295 if (__kmp_affinity_verbose) { 3296 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3297 KMP_I18N_STR(Decodingx2APIC)); 3298 } 3299 3300 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3301 if (depth == 0) { 3302 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3303 KMP_ASSERT(address2os == NULL); 3304 return; 3305 } 3306 if (depth < 0) { 3307 KMP_ASSERT(msg_id != kmp_i18n_null); 3308 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3309 } 3310 } 3311 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3312 if (__kmp_affinity_verbose) { 3313 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3314 KMP_I18N_STR(DecodingLegacyAPIC)); 3315 } 3316 3317 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3318 if (depth == 0) { 3319 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3320 KMP_ASSERT(address2os == NULL); 3321 return; 3322 } 3323 if (depth < 0) { 3324 KMP_ASSERT(msg_id != kmp_i18n_null); 3325 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3326 } 3327 } 3328 3329 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3330 3331 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3332 const char *filename; 3333 if (__kmp_cpuinfo_file != NULL) { 3334 filename = __kmp_cpuinfo_file; 3335 } 3336 else { 3337 filename = "/proc/cpuinfo"; 3338 } 3339 3340 if (__kmp_affinity_verbose) { 3341 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3342 } 3343 3344 FILE *f = fopen(filename, "r"); 3345 if (f == NULL) { 3346 int code = errno; 3347 if (__kmp_cpuinfo_file != NULL) { 3348 __kmp_msg( 3349 kmp_ms_fatal, 3350 KMP_MSG(CantOpenFileForReading, filename), 3351 KMP_ERR(code), 3352 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3353 __kmp_msg_null 3354 ); 3355 } 3356 else { 3357 __kmp_msg( 3358 kmp_ms_fatal, 3359 KMP_MSG(CantOpenFileForReading, filename), 3360 KMP_ERR(code), 3361 __kmp_msg_null 3362 ); 3363 } 3364 } 3365 int line = 0; 3366 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3367 fclose(f); 3368 if (depth < 0) { 3369 KMP_ASSERT(msg_id != kmp_i18n_null); 3370 if (line > 0) { 3371 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3372 } 3373 else { 3374 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3375 } 3376 } 3377 if (__kmp_affinity_type == affinity_none) { 3378 KMP_ASSERT(depth == 0); 3379 KMP_ASSERT(address2os == NULL); 3380 return; 3381 } 3382 } 3383 3384 # if KMP_GROUP_AFFINITY 3385 3386 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3387 if (__kmp_affinity_verbose) { 3388 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3389 } 3390 3391 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3392 KMP_ASSERT(depth != 0); 3393 if (depth < 0) { 3394 KMP_ASSERT(msg_id != kmp_i18n_null); 3395 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3396 } 3397 } 3398 3399 # endif /* KMP_GROUP_AFFINITY */ 3400 3401 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3402 if (__kmp_affinity_verbose) { 3403 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3404 } 3405 3406 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3407 if (depth == 0) { 3408 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3409 KMP_ASSERT(address2os == NULL); 3410 return; 3411 } 3412 // should not fail 3413 KMP_ASSERT(depth > 0); 3414 KMP_ASSERT(address2os != NULL); 3415 } 3416 3417 if (address2os == NULL) { 3418 if 
(KMP_AFFINITY_CAPABLE() 3419 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3420 && (__kmp_affinity_type != affinity_none)))) { 3421 KMP_WARNING(ErrorInitializeAffinity); 3422 } 3423 __kmp_affinity_type = affinity_none; 3424 KMP_AFFINITY_DISABLE(); 3425 return; 3426 } 3427 3428 __kmp_apply_thread_places(&address2os, depth); 3429 3430 // 3431 // Create the table of masks, indexed by thread Id. 3432 // 3433 unsigned maxIndex; 3434 unsigned numUnique; 3435 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3436 address2os, __kmp_avail_proc); 3437 if (__kmp_affinity_gran_levels == 0) { 3438 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3439 } 3440 3441 // 3442 // Set the childNums vector in all Address objects. This must be done 3443 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3444 // which takes into account the setting of __kmp_affinity_compact. 3445 // 3446 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3447 3448 switch (__kmp_affinity_type) { 3449 3450 case affinity_explicit: 3451 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3452 # if OMP_40_ENABLED 3453 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3454 # endif 3455 { 3456 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3457 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3458 maxIndex); 3459 } 3460 # if OMP_40_ENABLED 3461 else { 3462 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3463 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3464 maxIndex); 3465 } 3466 # endif 3467 if (__kmp_affinity_num_masks == 0) { 3468 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3469 && (__kmp_affinity_type != affinity_none))) { 3470 KMP_WARNING(AffNoValidProcID); 3471 } 3472 __kmp_affinity_type = affinity_none; 3473 return; 3474 } 3475 break; 3476 3477 // 3478 // The other affinity types rely on sorting the Addresses according 3479 // to some permutation of the machine topology tree. Set 3480 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3481 // then jump to a common code fragment to do the sort and create 3482 // the array of affinity masks. 
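// As a concrete reading of the cases below: with the default
// __kmp_affinity_compact of 0 and a 3-level package/core/thread map,
// affinity_scatter flips compact to depth - 1 - 0 == 2, while
// affinity_compact merely clamps an oversized value to depth - 1.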
3483 // 3484 3485 case affinity_logical: 3486 __kmp_affinity_compact = 0; 3487 if (__kmp_affinity_offset) { 3488 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3489 % __kmp_avail_proc; 3490 } 3491 goto sortAddresses; 3492 3493 case affinity_physical: 3494 if (__kmp_nThreadsPerCore > 1) { 3495 __kmp_affinity_compact = 1; 3496 if (__kmp_affinity_compact >= depth) { 3497 __kmp_affinity_compact = 0; 3498 } 3499 } else { 3500 __kmp_affinity_compact = 0; 3501 } 3502 if (__kmp_affinity_offset) { 3503 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3504 % __kmp_avail_proc; 3505 } 3506 goto sortAddresses; 3507 3508 case affinity_scatter: 3509 if (__kmp_affinity_compact >= depth) { 3510 __kmp_affinity_compact = 0; 3511 } 3512 else { 3513 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3514 } 3515 goto sortAddresses; 3516 3517 case affinity_compact: 3518 if (__kmp_affinity_compact >= depth) { 3519 __kmp_affinity_compact = depth - 1; 3520 } 3521 goto sortAddresses; 3522 3523 case affinity_balanced: 3524 // Balanced works only for the case of a single package 3525 if( nPackages > 1 ) { 3526 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3527 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3528 } 3529 __kmp_affinity_type = affinity_none; 3530 return; 3531 } else if( __kmp_affinity_uniform_topology() ) { 3532 break; 3533 } else { // Non-uniform topology 3534 3535 // Save the depth for further usage 3536 __kmp_aff_depth = depth; 3537 3538 // Number of hyper threads per core in HT machine 3539 int nth_per_core = __kmp_nThreadsPerCore; 3540 3541 int core_level; 3542 if( nth_per_core > 1 ) { 3543 core_level = depth - 2; 3544 } else { 3545 core_level = depth - 1; 3546 } 3547 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3548 int nproc = nth_per_core * ncores; 3549 3550 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3551 for( int i = 0; i < nproc; i++ ) { 3552 procarr[ i ] = -1; 3553 } 3554 3555 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3556 int proc = address2os[ i ].second; 3557 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3558 // If there is only one thread per core then depth == 2: level 0 - package, 3559 // level 1 - core. 3560 int level = depth - 1; 3561 3562 // __kmp_nth_per_core == 1 3563 int thread = 0; 3564 int core = address2os[ i ].first.labels[ level ]; 3565 // If the thread level exists, that is we have more than one thread context per core 3566 if( nth_per_core > 1 ) { 3567 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3568 core = address2os[ i ].first.labels[ level - 1 ]; 3569 } 3570 procarr[ core * nth_per_core + thread ] = proc; 3571 } 3572 3573 break; 3574 } 3575 3576 sortAddresses: 3577 // 3578 // Allocate the gtid->affinity mask table. 
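// The table gets __kmp_avail_proc entries when duplicates are kept
// (__kmp_affinity_dups), otherwise one entry per unique leader mask that
// __kmp_create_masks() computed above; under OMP 4.0 an explicit place
// count may shrink it further.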
3579 // 3580 if (__kmp_affinity_dups) { 3581 __kmp_affinity_num_masks = __kmp_avail_proc; 3582 } 3583 else { 3584 __kmp_affinity_num_masks = numUnique; 3585 } 3586 3587 # if OMP_40_ENABLED 3588 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3589 && ( __kmp_affinity_num_places > 0 ) 3590 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3591 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3592 } 3593 # endif 3594 3595 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3596 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3597 3598 // 3599 // Sort the address2os table according to the current setting of 3600 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3601 // 3602 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3603 __kmp_affinity_cmp_Address_child_num); 3604 { 3605 int i; 3606 unsigned j; 3607 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3608 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3609 continue; 3610 } 3611 unsigned osId = address2os[i].second; 3612 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3613 kmp_affin_mask_t *dest 3614 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3615 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3616 KMP_CPU_COPY(dest, src); 3617 if (++j >= __kmp_affinity_num_masks) { 3618 break; 3619 } 3620 } 3621 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3622 } 3623 break; 3624 3625 default: 3626 KMP_ASSERT2(0, "Unexpected affinity setting"); 3627 } 3628 3629 __kmp_free(osId2Mask); 3630 machine_hierarchy.init(address2os, __kmp_avail_proc); 3631 } 3632 3633 3634 void 3635 __kmp_affinity_initialize(void) 3636 { 3637 // 3638 // Much of the code above was written assumming that if a machine was not 3639 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3640 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3641 // 3642 // There are too many checks for __kmp_affinity_type == affinity_none 3643 // in this code. Instead of trying to change them all, check if 3644 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3645 // affinity_none, call the real initialization routine, then restore 3646 // __kmp_affinity_type to affinity_disabled. 3647 // 3648 int disabled = (__kmp_affinity_type == affinity_disabled); 3649 if (! KMP_AFFINITY_CAPABLE()) { 3650 KMP_ASSERT(disabled); 3651 } 3652 if (disabled) { 3653 __kmp_affinity_type = affinity_none; 3654 } 3655 __kmp_aux_affinity_initialize(); 3656 if (disabled) { 3657 __kmp_affinity_type = affinity_disabled; 3658 } 3659 } 3660 3661 3662 void 3663 __kmp_affinity_uninitialize(void) 3664 { 3665 if (__kmp_affinity_masks != NULL) { 3666 __kmp_free(__kmp_affinity_masks); 3667 __kmp_affinity_masks = NULL; 3668 } 3669 if (fullMask != NULL) { 3670 KMP_CPU_FREE(fullMask); 3671 fullMask = NULL; 3672 } 3673 __kmp_affinity_num_masks = 0; 3674 # if OMP_40_ENABLED 3675 __kmp_affinity_num_places = 0; 3676 # endif 3677 if (__kmp_affinity_proclist != NULL) { 3678 __kmp_free(__kmp_affinity_proclist); 3679 __kmp_affinity_proclist = NULL; 3680 } 3681 if( address2os != NULL ) { 3682 __kmp_free( address2os ); 3683 address2os = NULL; 3684 } 3685 if( procarr != NULL ) { 3686 __kmp_free( procarr ); 3687 procarr = NULL; 3688 } 3689 } 3690 3691 3692 void 3693 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3694 { 3695 if (! 
KMP_AFFINITY_CAPABLE()) { 3696 return; 3697 } 3698 3699 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 3700 if (th->th.th_affin_mask == NULL) { 3701 KMP_CPU_ALLOC(th->th.th_affin_mask); 3702 } 3703 else { 3704 KMP_CPU_ZERO(th->th.th_affin_mask); 3705 } 3706 3707 // 3708 // Copy the thread mask to the kmp_info_t strucuture. 3709 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one 3710 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask 3711 // is set, then the full mask is the same as the mask of the initialization 3712 // thread. 3713 // 3714 kmp_affin_mask_t *mask; 3715 int i; 3716 3717 # if OMP_40_ENABLED 3718 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3719 # endif 3720 { 3721 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced) 3722 ) { 3723 # if KMP_GROUP_AFFINITY 3724 if (__kmp_num_proc_groups > 1) { 3725 return; 3726 } 3727 # endif 3728 KMP_ASSERT(fullMask != NULL); 3729 i = KMP_PLACE_ALL; 3730 mask = fullMask; 3731 } 3732 else { 3733 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 3734 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 3735 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 3736 } 3737 } 3738 # if OMP_40_ENABLED 3739 else { 3740 if ((! isa_root) 3741 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 3742 # if KMP_GROUP_AFFINITY 3743 if (__kmp_num_proc_groups > 1) { 3744 return; 3745 } 3746 # endif 3747 KMP_ASSERT(fullMask != NULL); 3748 i = KMP_PLACE_ALL; 3749 mask = fullMask; 3750 } 3751 else { 3752 // 3753 // int i = some hash function or just a counter that doesn't 3754 // always start at 0. Use gtid for now. 3755 // 3756 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 3757 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 3758 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 3759 } 3760 } 3761 # endif 3762 3763 # if OMP_40_ENABLED 3764 th->th.th_current_place = i; 3765 if (isa_root) { 3766 th->th.th_new_place = i; 3767 th->th.th_first_place = 0; 3768 th->th.th_last_place = __kmp_affinity_num_masks - 1; 3769 } 3770 3771 if (i == KMP_PLACE_ALL) { 3772 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 3773 gtid)); 3774 } 3775 else { 3776 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 3777 gtid, i)); 3778 } 3779 # else 3780 if (i == -1) { 3781 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n", 3782 gtid)); 3783 } 3784 else { 3785 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 3786 gtid, i)); 3787 } 3788 # endif /* OMP_40_ENABLED */ 3789 3790 KMP_CPU_COPY(th->th.th_affin_mask, mask); 3791 3792 if (__kmp_affinity_verbose) { 3793 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3794 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3795 th->th.th_affin_mask); 3796 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid, 3797 buf); 3798 } 3799 3800 # if KMP_OS_WINDOWS 3801 // 3802 // On Windows* OS, the process affinity mask might have changed. 3803 // If the user didn't request affinity and this call fails, 3804 // just continue silently. See CQ171393. 3805 // 3806 if ( __kmp_affinity_type == affinity_none ) { 3807 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 3808 } 3809 else 3810 # endif 3811 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 3812 } 3813 3814 3815 # if OMP_40_ENABLED 3816 3817 void 3818 __kmp_affinity_set_place(int gtid) 3819 { 3820 int retval; 3821 3822 if (! 
KMP_AFFINITY_CAPABLE()) {
3823 return;
3824 }
3825
3826 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
3827
3828 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
3829 gtid, th->th.th_new_place, th->th.th_current_place));
3830
3831 //
3832 // Check that the new place is within this thread's partition.
3833 //
3834 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3835 KMP_ASSERT(th->th.th_new_place >= 0);
3836 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
3837 if (th->th.th_first_place <= th->th.th_last_place) {
3838 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
3839 && (th->th.th_new_place <= th->th.th_last_place));
3840 }
3841 else {
3842 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
3843 || (th->th.th_new_place >= th->th.th_last_place));
3844 }
3845
3846 //
3847 // Copy the thread mask to the kmp_info_t structure,
3848 // and set this thread's affinity.
3849 //
3850 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
3851 th->th.th_new_place);
3852 KMP_CPU_COPY(th->th.th_affin_mask, mask);
3853 th->th.th_current_place = th->th.th_new_place;
3854
3855 if (__kmp_affinity_verbose) {
3856 char buf[KMP_AFFIN_MASK_PRINT_LEN];
3857 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3858 th->th.th_affin_mask);
3859 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
3860 gtid, buf);
3861 }
3862 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
3863 }
3864
3865 # endif /* OMP_40_ENABLED */
3866
3867
3868 int
3869 __kmp_aux_set_affinity(void **mask)
3870 {
3871 int gtid;
3872 kmp_info_t *th;
3873 int retval;
3874
3875 if (! KMP_AFFINITY_CAPABLE()) {
3876 return -1;
3877 }
3878
3879 gtid = __kmp_entry_gtid();
3880 KA_TRACE(1000, ;{
3881 char buf[KMP_AFFIN_MASK_PRINT_LEN];
3882 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
3883 (kmp_affin_mask_t *)(*mask));
3884 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
3885 gtid, buf);
3886 });
3887
3888 if (__kmp_env_consistency_check) {
3889 if ((mask == NULL) || (*mask == NULL)) {
3890 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3891 }
3892 else {
3893 unsigned proc;
3894 int num_procs = 0;
3895
3896 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
3897 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
3898 continue;
3899 }
3900 num_procs++;
3901 if (! KMP_CPU_ISSET(proc, fullMask)) {
3902 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3903 break;
3904 }
3905 }
3906 if (num_procs == 0) {
3907 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3908 }
3909
3910 # if KMP_GROUP_AFFINITY
3911 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
3912 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
3913 }
3914 # endif /* KMP_GROUP_AFFINITY */
3915
3916 }
3917 }
3918
3919 th = __kmp_threads[gtid];
3920 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
3921 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
3922 if (retval == 0) {
3923 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
3924 }
3925
3926 # if OMP_40_ENABLED
3927 th->th.th_current_place = KMP_PLACE_UNDEFINED;
3928 th->th.th_new_place = KMP_PLACE_UNDEFINED;
3929 th->th.th_first_place = 0;
3930 th->th.th_last_place = __kmp_affinity_num_masks - 1;
3931
3932 //
3933 // Turn off 4.0 affinity for the current thread at this parallel level.
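// (A user-level kmp_set_affinity() call supersedes whatever OMP 4.0 place
// this thread was bound to, hence the proc_bind_false assignment below.)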

int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}
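
//
// Usage sketch (illustration only; not part of this file): the __kmp_aux_*
// routines above back the user-level kmp_* affinity API exposed through
// omp.h.  Assuming those entry points forward here, a caller might write:
//
//     kmp_affinity_mask_t mask;
//     kmp_create_affinity_mask(&mask);
//     if (kmp_set_affinity_mask_proc(3, &mask) != 0) {
//         // -1: proc out of range, -2: proc not in the full mask
//     }
//     if (kmp_set_affinity(&mask) != 0) {
//         // non-zero: the OS rejected the mask (retval from the OS call)
//     }
//     kmp_destroy_affinity_mask(&mask);
//
// The return-code meanings (-1 out of range, -2 not in fullMask, 0 success)
// mirror the checks in __kmp_aux_set_affinity_mask_proc() above.
//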

int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
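        //
        // Worked example (hypothetical numbers, for illustration only):
        // with nthreads = 10 and ncores = 4, chunk = 2, big_cores = 2 and
        // big_nth = 6.  Threads 0-5 land on the two "big" cores, three per
        // core (coreID = tid / 3), and threads 6-9 land on the remaining
        // two cores, two per core (coreID = (tid - 2) / 2).
        //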
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1);
            threadID = ( tid % (chunk + 1) ) % __kmp_nth_per_core;
        } else { // tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)KMP_ALLOCA(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be no more than
                // nth_per_core of them, and since address2os is sorted we can break as
                // soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first
                                // available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with exactly "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with between x and nth_per_core available processors
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }
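            //
            // Worked example (hypothetical numbers, for illustration only):
            // with nth_per_core = 2 and three cores whose available-processor
            // counts are nproc_at_core = {2, 1, 2}, the histogram becomes
            // ncores_with_x_procs = {0, 1, 2} and the suffix sums give
            // ncores_with_x_to_max_procs = {3, 3, 2}, i.e. three cores have
            // at least one processor and two cores have the full two.
            //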
break; 4277 } 4278 } 4279 } 4280 } 4281 if( cnt == 0 || nth == 0 ) { 4282 break; 4283 } 4284 } 4285 if( nth == 0 ) { 4286 break; 4287 } 4288 } 4289 flag = 1; 4290 } 4291 int sum = 0; 4292 for( int i = 0; i < nproc; i++ ) { 4293 sum += newarr[ i ]; 4294 if( sum > tid ) { 4295 // Granularity == thread 4296 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4297 int osID = procarr[ i ]; 4298 KMP_CPU_SET( osID, mask); 4299 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4300 int coreID = i / nth_per_core; 4301 for( int ii = 0; ii < nth_per_core; ii++ ) { 4302 int osID = procarr[ coreID * nth_per_core + ii ]; 4303 if( osID != -1 ) { 4304 KMP_CPU_SET( osID, mask); 4305 } 4306 } 4307 } 4308 break; 4309 } 4310 } 4311 __kmp_free( newarr ); 4312 } 4313 4314 if (__kmp_affinity_verbose) { 4315 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4316 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4317 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4318 tid, buf); 4319 } 4320 __kmp_set_system_affinity( mask, TRUE ); 4321 } 4322 } 4323 4324 #endif // KMP_AFFINITY_SUPPORTED 4325