1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_affinity.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_str.h" 21 #include "kmp_wrapper_getpid.h" 22 23 // Store the real or imagined machine hierarchy here 24 static hierarchy_info machine_hierarchy; 25 26 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } 27 28 29 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 30 kmp_uint32 depth; 31 // The test below is true if affinity is available, but set to "none". Need to 32 // init on first use of hierarchical barrier. 33 if (TCR_1(machine_hierarchy.uninitialized)) 34 machine_hierarchy.init(NULL, nproc); 35 36 // Adjust the hierarchy in case num threads exceeds original 37 if (nproc > machine_hierarchy.base_num_threads) 38 machine_hierarchy.resize(nproc); 39 40 depth = machine_hierarchy.depth; 41 KMP_DEBUG_ASSERT(depth > 0); 42 43 thr_bar->depth = depth; 44 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1; 45 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 46 } 47 48 #if KMP_AFFINITY_SUPPORTED 49 50 bool KMPAffinity::picked_api = false; 51 52 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } 53 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } 54 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); } 55 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); } 56 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } 57 void KMPAffinity::operator delete(void *p) { __kmp_free(p); } 58 59 void KMPAffinity::pick_api() { 60 KMPAffinity *affinity_dispatch; 61 if (picked_api) 62 return; 63 #if KMP_USE_HWLOC 64 // Only use Hwloc if affinity isn't explicitly disabled and 65 // user requests Hwloc topology method 66 if (__kmp_affinity_top_method == affinity_top_method_hwloc && 67 __kmp_affinity_type != affinity_disabled) { 68 affinity_dispatch = new KMPHwlocAffinity(); 69 } else 70 #endif 71 { 72 affinity_dispatch = new KMPNativeAffinity(); 73 } 74 __kmp_affinity_dispatch = affinity_dispatch; 75 picked_api = true; 76 } 77 78 void KMPAffinity::destroy_api() { 79 if (__kmp_affinity_dispatch != NULL) { 80 delete __kmp_affinity_dispatch; 81 __kmp_affinity_dispatch = NULL; 82 picked_api = false; 83 } 84 } 85 86 // Print the affinity mask to the character array in a pretty format. 87 char *__kmp_affinity_print_mask(char *buf, int buf_len, 88 kmp_affin_mask_t *mask) { 89 KMP_ASSERT(buf_len >= 40); 90 char *scan = buf; 91 char *end = buf + buf_len - 1; 92 93 // Find first element / check for empty set. 94 size_t i; 95 i = mask->begin(); 96 if (i == mask->end()) { 97 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}"); 98 while (*scan != '\0') 99 scan++; 100 KMP_ASSERT(scan <= end); 101 return buf; 102 } 103 104 KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i); 105 while (*scan != '\0') 106 scan++; 107 i++; 108 for (; i != mask->end(); i = mask->next(i)) { 109 if (!KMP_CPU_ISSET(i, mask)) { 110 continue; 111 } 112 113 // Check for buffer overflow. 
A string of the form ",<n>" will have at most 114 // 10 characters, plus we want to leave room to print ",...}" if the set is 115 // too large to print for a total of 15 characters. We already left room for 116 // '\0' in setting end. 117 if (end - scan < 15) { 118 break; 119 } 120 KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i); 121 while (*scan != '\0') 122 scan++; 123 } 124 if (i != mask->end()) { 125 KMP_SNPRINTF(scan, end - scan + 1, ",..."); 126 while (*scan != '\0') 127 scan++; 128 } 129 KMP_SNPRINTF(scan, end - scan + 1, "}"); 130 while (*scan != '\0') 131 scan++; 132 KMP_ASSERT(scan <= end); 133 return buf; 134 } 135 136 void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { 137 KMP_CPU_ZERO(mask); 138 139 #if KMP_GROUP_AFFINITY 140 141 if (__kmp_num_proc_groups > 1) { 142 int group; 143 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 144 for (group = 0; group < __kmp_num_proc_groups; group++) { 145 int i; 146 int num = __kmp_GetActiveProcessorCount(group); 147 for (i = 0; i < num; i++) { 148 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 149 } 150 } 151 } else 152 153 #endif /* KMP_GROUP_AFFINITY */ 154 155 { 156 int proc; 157 for (proc = 0; proc < __kmp_xproc; proc++) { 158 KMP_CPU_SET(proc, mask); 159 } 160 } 161 } 162 163 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be 164 // called to renumber the labels from [0..n] and place them into the child_num 165 // vector of the address object. This is done in case the labels used for 166 // the children at one node of the hierarchy differ from those used for 167 // another node at the same level. Example: suppose the machine has 2 nodes 168 // with 2 packages each. The first node contains packages 601 and 602, and 169 // second node contains packages 603 and 604. If we try to sort the table 170 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 171 // because we are paying attention to the labels themselves, not the ordinal 172 // child numbers. By using the child numbers in the sort, the result is 173 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. 
174 static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 175 int numAddrs) { 176 KMP_DEBUG_ASSERT(numAddrs > 0); 177 int depth = address2os->first.depth; 178 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 179 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 180 int labCt; 181 for (labCt = 0; labCt < depth; labCt++) { 182 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 183 lastLabel[labCt] = address2os[0].first.labels[labCt]; 184 } 185 int i; 186 for (i = 1; i < numAddrs; i++) { 187 for (labCt = 0; labCt < depth; labCt++) { 188 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 189 int labCt2; 190 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 191 counts[labCt2] = 0; 192 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 193 } 194 counts[labCt]++; 195 lastLabel[labCt] = address2os[i].first.labels[labCt]; 196 break; 197 } 198 } 199 for (labCt = 0; labCt < depth; labCt++) { 200 address2os[i].first.childNums[labCt] = counts[labCt]; 201 } 202 for (; labCt < (int)Address::maxDepth; labCt++) { 203 address2os[i].first.childNums[labCt] = 0; 204 } 205 } 206 __kmp_free(lastLabel); 207 __kmp_free(counts); 208 } 209 210 // All of the __kmp_affinity_create_*_map() routines should set 211 // __kmp_affinity_masks to a vector of affinity mask objects of length 212 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return 213 // the number of levels in the machine topology tree (zero if 214 // __kmp_affinity_type == affinity_none). 215 // 216 // All of the __kmp_affinity_create_*_map() routines should set 217 // *__kmp_affin_fullMask to the affinity mask for the initialization thread. 218 // They need to save and restore the mask, and it could be needed later, so 219 // saving it is just an optimization to avoid calling kmp_get_system_affinity() 220 // again. 221 kmp_affin_mask_t *__kmp_affin_fullMask = NULL; 222 223 static int nCoresPerPkg, nPackages; 224 static int __kmp_nThreadsPerCore; 225 #ifndef KMP_DFLT_NTH_CORES 226 static int __kmp_ncores; 227 #endif 228 static int *__kmp_pu_os_idx = NULL; 229 230 // __kmp_affinity_uniform_topology() doesn't work when called from 231 // places which support arbitrarily many levels in the machine topology 232 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() 233 // __kmp_affinity_create_x2apicid_map(). 234 inline static bool __kmp_affinity_uniform_topology() { 235 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); 236 } 237 238 // Print out the detailed machine topology map, i.e. the physical locations 239 // of each OS proc. 
240 static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, 241 int depth, int pkgLevel, 242 int coreLevel, int threadLevel) { 243 int proc; 244 245 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); 246 for (proc = 0; proc < len; proc++) { 247 int level; 248 kmp_str_buf_t buf; 249 __kmp_str_buf_init(&buf); 250 for (level = 0; level < depth; level++) { 251 if (level == threadLevel) { 252 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread)); 253 } else if (level == coreLevel) { 254 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core)); 255 } else if (level == pkgLevel) { 256 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package)); 257 } else if (level > pkgLevel) { 258 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node), 259 level - pkgLevel - 1); 260 } else { 261 __kmp_str_buf_print(&buf, "L%d ", level); 262 } 263 __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]); 264 } 265 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second, 266 buf.str); 267 __kmp_str_buf_free(&buf); 268 } 269 } 270 271 #if KMP_USE_HWLOC 272 273 // This function removes the topology levels that are radix 1 and don't offer 274 // further information about the topology. The most common example is when you 275 // have one thread context per core, we don't want the extra thread context 276 // level if it offers no unique labels. So they are removed. 277 // return value: the new depth of address2os 278 static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, 279 int nActiveThreads, int depth, 280 int *pkgLevel, int *coreLevel, 281 int *threadLevel) { 282 int level; 283 int i; 284 int radix1_detected; 285 286 for (level = depth - 1; level >= 0; --level) { 287 // Always keep the package level 288 if (level == *pkgLevel) 289 continue; 290 // Detect if this level is radix 1 291 radix1_detected = 1; 292 for (i = 1; i < nActiveThreads; ++i) { 293 if (address2os[0].first.labels[level] != 294 address2os[i].first.labels[level]) { 295 // There are differing label values for this level so it stays 296 radix1_detected = 0; 297 break; 298 } 299 } 300 if (!radix1_detected) 301 continue; 302 // Radix 1 was detected 303 if (level == *threadLevel) { 304 // If only one thread per core, then just decrement 305 // the depth which removes the threadlevel from address2os 306 for (i = 0; i < nActiveThreads; ++i) { 307 address2os[i].first.depth--; 308 } 309 *threadLevel = -1; 310 } else if (level == *coreLevel) { 311 // For core level, we move the thread labels over if they are still 312 // valid (*threadLevel != -1), and also reduce the depth another level 313 for (i = 0; i < nActiveThreads; ++i) { 314 if (*threadLevel != -1) { 315 address2os[i].first.labels[*coreLevel] = 316 address2os[i].first.labels[*threadLevel]; 317 } 318 address2os[i].first.depth--; 319 } 320 *coreLevel = -1; 321 } 322 } 323 return address2os[0].first.depth; 324 } 325 326 // Returns the number of objects of type 'type' below 'obj' within the topology 327 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is 328 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET 329 // object. 
330 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, 331 hwloc_obj_type_t type) { 332 int retval = 0; 333 hwloc_obj_t first; 334 for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, 335 obj->logical_index, type, 0); 336 first != NULL && 337 hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == 338 obj; 339 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, 340 first)) { 341 ++retval; 342 } 343 return retval; 344 } 345 346 static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, 347 kmp_i18n_id_t *const msg_id) { 348 *address2os = NULL; 349 *msg_id = kmp_i18n_null; 350 351 // Save the affinity mask for the current thread. 352 kmp_affin_mask_t *oldMask; 353 KMP_CPU_ALLOC(oldMask); 354 __kmp_get_system_affinity(oldMask, TRUE); 355 356 int depth = 3; 357 int pkgLevel = 0; 358 int coreLevel = 1; 359 int threadLevel = 2; 360 361 if (!KMP_AFFINITY_CAPABLE()) { 362 // Hack to try and infer the machine topology using only the data 363 // available from cpuid on the current thread, and __kmp_xproc. 364 KMP_ASSERT(__kmp_affinity_type == affinity_none); 365 366 nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj( 367 hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0), 368 HWLOC_OBJ_CORE); 369 __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj( 370 hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), 371 HWLOC_OBJ_PU); 372 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 373 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 374 if (__kmp_affinity_verbose) { 375 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 376 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 377 if (__kmp_affinity_uniform_topology()) { 378 KMP_INFORM(Uniform, "KMP_AFFINITY"); 379 } else { 380 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 381 } 382 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 383 __kmp_nThreadsPerCore, __kmp_ncores); 384 } 385 KMP_CPU_FREE(oldMask); 386 return 0; 387 } 388 389 // Allocate the data structure to be returned. 390 AddrUnsPair *retval = 391 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 392 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 393 394 // When affinity is off, this routine will still be called to set 395 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 396 // nCoresPerPkg, & nPackages. Make sure all these vars are set 397 // correctly, and return if affinity is not enabled. 
398 399 hwloc_obj_t pu; 400 hwloc_obj_t core; 401 hwloc_obj_t socket; 402 int nActiveThreads = 0; 403 int socket_identifier = 0; 404 // re-calculate globals to count only accessible resources 405 __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0; 406 for (socket = 407 hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0); 408 socket != NULL; socket = hwloc_get_next_obj_by_type( 409 __kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, socket), 410 socket_identifier++) { 411 int core_identifier = 0; 412 int num_active_cores = 0; 413 for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, 414 socket->logical_index, 415 HWLOC_OBJ_CORE, 0); 416 core != NULL && 417 hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, 418 core) == socket; 419 core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 420 core), 421 core_identifier++) { 422 int pu_identifier = 0; 423 int num_active_threads = 0; 424 for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, 425 core->logical_index, HWLOC_OBJ_PU, 426 0); 427 pu != NULL && 428 hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, 429 pu) == core; 430 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, 431 pu), 432 pu_identifier++) { 433 Address addr(3); 434 if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask)) 435 continue; // skip inactive (inaccessible) unit 436 KA_TRACE(20, 437 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n", 438 socket->os_index, socket->logical_index, core->os_index, 439 core->logical_index, pu->os_index, pu->logical_index)); 440 addr.labels[0] = socket_identifier; // package 441 addr.labels[1] = core_identifier; // core 442 addr.labels[2] = pu_identifier; // pu 443 retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index); 444 __kmp_pu_os_idx[nActiveThreads] = 445 pu->os_index; // keep os index for each active pu 446 nActiveThreads++; 447 ++num_active_threads; // count active threads per core 448 } 449 if (num_active_threads) { // were there any active threads on the core? 450 ++__kmp_ncores; // count total active cores 451 ++num_active_cores; // count active cores per socket 452 if (num_active_threads > __kmp_nThreadsPerCore) 453 __kmp_nThreadsPerCore = num_active_threads; // calc maximum 454 } 455 } 456 if (num_active_cores) { // were there any active cores on the socket? 457 ++nPackages; // count total active packages 458 if (num_active_cores > nCoresPerPkg) 459 nCoresPerPkg = num_active_cores; // calc maximum 460 } 461 } 462 463 // If there's only one thread context to bind to, return now. 
464 KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc); 465 KMP_ASSERT(nActiveThreads > 0); 466 if (nActiveThreads == 1) { 467 __kmp_ncores = nPackages = 1; 468 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 469 if (__kmp_affinity_verbose) { 470 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 471 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 472 473 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 474 if (__kmp_affinity_respect_mask) { 475 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 476 } else { 477 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 478 } 479 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 480 KMP_INFORM(Uniform, "KMP_AFFINITY"); 481 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 482 __kmp_nThreadsPerCore, __kmp_ncores); 483 } 484 485 if (__kmp_affinity_type == affinity_none) { 486 __kmp_free(retval); 487 KMP_CPU_FREE(oldMask); 488 return 0; 489 } 490 491 // Form an Address object which only includes the package level. 492 Address addr(1); 493 addr.labels[0] = retval[0].first.labels[pkgLevel]; 494 retval[0].first = addr; 495 496 if (__kmp_affinity_gran_levels < 0) { 497 __kmp_affinity_gran_levels = 0; 498 } 499 500 if (__kmp_affinity_verbose) { 501 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 502 } 503 504 *address2os = retval; 505 KMP_CPU_FREE(oldMask); 506 return 1; 507 } 508 509 // Sort the table by physical Id. 510 qsort(retval, nActiveThreads, sizeof(*retval), 511 __kmp_affinity_cmp_Address_labels); 512 513 // Check to see if the machine topology is uniform 514 unsigned uniform = 515 (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads); 516 517 // Print the machine topology summary. 518 if (__kmp_affinity_verbose) { 519 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 520 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 521 522 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 523 if (__kmp_affinity_respect_mask) { 524 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 525 } else { 526 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 527 } 528 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 529 if (uniform) { 530 KMP_INFORM(Uniform, "KMP_AFFINITY"); 531 } else { 532 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 533 } 534 535 kmp_str_buf_t buf; 536 __kmp_str_buf_init(&buf); 537 538 __kmp_str_buf_print(&buf, "%d", nPackages); 539 // for (level = 1; level <= pkgLevel; level++) { 540 // __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 541 // } 542 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 543 __kmp_nThreadsPerCore, __kmp_ncores); 544 545 __kmp_str_buf_free(&buf); 546 } 547 548 if (__kmp_affinity_type == affinity_none) { 549 __kmp_free(retval); 550 KMP_CPU_FREE(oldMask); 551 return 0; 552 } 553 554 // Find any levels with radiix 1, and remove them from the map 555 // (except for the package level). 556 depth = __kmp_affinity_remove_radix_one_levels( 557 retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel); 558 559 if (__kmp_affinity_gran_levels < 0) { 560 // Set the granularity level based on what levels are modeled 561 // in the machine topology map. 
562 __kmp_affinity_gran_levels = 0; 563 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 564 __kmp_affinity_gran_levels++; 565 } 566 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 567 __kmp_affinity_gran_levels++; 568 } 569 if (__kmp_affinity_gran > affinity_gran_package) { 570 __kmp_affinity_gran_levels++; 571 } 572 } 573 574 if (__kmp_affinity_verbose) { 575 __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel, 576 coreLevel, threadLevel); 577 } 578 579 KMP_CPU_FREE(oldMask); 580 *address2os = retval; 581 return depth; 582 } 583 #endif // KMP_USE_HWLOC 584 585 // If we don't know how to retrieve the machine's processor topology, or 586 // encounter an error in doing so, this routine is called to form a "flat" 587 // mapping of os thread id's <-> processor id's. 588 static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, 589 kmp_i18n_id_t *const msg_id) { 590 *address2os = NULL; 591 *msg_id = kmp_i18n_null; 592 593 // Even if __kmp_affinity_type == affinity_none, this routine might still 594 // called to set __kmp_ncores, as well as 595 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 596 if (!KMP_AFFINITY_CAPABLE()) { 597 KMP_ASSERT(__kmp_affinity_type == affinity_none); 598 __kmp_ncores = nPackages = __kmp_xproc; 599 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 600 if (__kmp_affinity_verbose) { 601 KMP_INFORM(AffFlatTopology, "KMP_AFFINITY"); 602 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 603 KMP_INFORM(Uniform, "KMP_AFFINITY"); 604 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 605 __kmp_nThreadsPerCore, __kmp_ncores); 606 } 607 return 0; 608 } 609 610 // When affinity is off, this routine will still be called to set 611 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 612 // Make sure all these vars are set correctly, and return now if affinity is 613 // not enabled. 614 __kmp_ncores = nPackages = __kmp_avail_proc; 615 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 616 if (__kmp_affinity_verbose) { 617 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 618 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 619 __kmp_affin_fullMask); 620 621 KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY"); 622 if (__kmp_affinity_respect_mask) { 623 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 624 } else { 625 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 626 } 627 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 628 KMP_INFORM(Uniform, "KMP_AFFINITY"); 629 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 630 __kmp_nThreadsPerCore, __kmp_ncores); 631 } 632 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 633 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 634 if (__kmp_affinity_type == affinity_none) { 635 int avail_ct = 0; 636 int i; 637 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 638 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) 639 continue; 640 __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat 641 } 642 return 0; 643 } 644 645 // Contruct the data structure to be returned. 646 *address2os = 647 (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 648 int avail_ct = 0; 649 unsigned int i; 650 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 651 // Skip this proc if it is not included in the machine model. 
652 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 653 continue; 654 } 655 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat 656 Address addr(1); 657 addr.labels[0] = i; 658 (*address2os)[avail_ct++] = AddrUnsPair(addr, i); 659 } 660 if (__kmp_affinity_verbose) { 661 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); 662 } 663 664 if (__kmp_affinity_gran_levels < 0) { 665 // Only the package level is modeled in the machine topology map, 666 // so the #levels of granularity is either 0 or 1. 667 if (__kmp_affinity_gran > affinity_gran_package) { 668 __kmp_affinity_gran_levels = 1; 669 } else { 670 __kmp_affinity_gran_levels = 0; 671 } 672 } 673 return 1; 674 } 675 676 #if KMP_GROUP_AFFINITY 677 678 // If multiple Windows* OS processor groups exist, we can create a 2-level 679 // topology map with the groups at level 0 and the individual procs at level 1. 680 // This facilitates letting the threads float among all procs in a group, 681 // if granularity=group (the default when there are multiple groups). 682 static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, 683 kmp_i18n_id_t *const msg_id) { 684 *address2os = NULL; 685 *msg_id = kmp_i18n_null; 686 687 // If we aren't affinity capable, then return now. 688 // The flat mapping will be used. 689 if (!KMP_AFFINITY_CAPABLE()) { 690 // FIXME set *msg_id 691 return -1; 692 } 693 694 // Contruct the data structure to be returned. 695 *address2os = 696 (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 697 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 698 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 699 int avail_ct = 0; 700 int i; 701 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 702 // Skip this proc if it is not included in the machine model. 
703 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 704 continue; 705 } 706 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat 707 Address addr(2); 708 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); 709 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); 710 (*address2os)[avail_ct++] = AddrUnsPair(addr, i); 711 712 if (__kmp_affinity_verbose) { 713 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], 714 addr.labels[1]); 715 } 716 } 717 718 if (__kmp_affinity_gran_levels < 0) { 719 if (__kmp_affinity_gran == affinity_gran_group) { 720 __kmp_affinity_gran_levels = 1; 721 } else if ((__kmp_affinity_gran == affinity_gran_fine) || 722 (__kmp_affinity_gran == affinity_gran_thread)) { 723 __kmp_affinity_gran_levels = 0; 724 } else { 725 const char *gran_str = NULL; 726 if (__kmp_affinity_gran == affinity_gran_core) { 727 gran_str = "core"; 728 } else if (__kmp_affinity_gran == affinity_gran_package) { 729 gran_str = "package"; 730 } else if (__kmp_affinity_gran == affinity_gran_node) { 731 gran_str = "node"; 732 } else { 733 KMP_ASSERT(0); 734 } 735 736 // Warning: can't use affinity granularity \"gran\" with group topology 737 // method, using "thread" 738 __kmp_affinity_gran_levels = 0; 739 } 740 } 741 return 2; 742 } 743 744 #endif /* KMP_GROUP_AFFINITY */ 745 746 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 747 748 static int __kmp_cpuid_mask_width(int count) { 749 int r = 0; 750 751 while ((1 << r) < count) 752 ++r; 753 return r; 754 } 755 756 class apicThreadInfo { 757 public: 758 unsigned osId; // param to __kmp_affinity_bind_thread 759 unsigned apicId; // from cpuid after binding 760 unsigned maxCoresPerPkg; // "" 761 unsigned maxThreadsPerPkg; // "" 762 unsigned pkgId; // inferred from above values 763 unsigned coreId; // "" 764 unsigned threadId; // "" 765 }; 766 767 static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, 768 const void *b) { 769 const apicThreadInfo *aa = (const apicThreadInfo *)a; 770 const apicThreadInfo *bb = (const apicThreadInfo *)b; 771 if (aa->osId < bb->osId) 772 return -1; 773 if (aa->osId > bb->osId) 774 return 1; 775 return 0; 776 } 777 778 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, 779 const void *b) { 780 const apicThreadInfo *aa = (const apicThreadInfo *)a; 781 const apicThreadInfo *bb = (const apicThreadInfo *)b; 782 if (aa->pkgId < bb->pkgId) 783 return -1; 784 if (aa->pkgId > bb->pkgId) 785 return 1; 786 if (aa->coreId < bb->coreId) 787 return -1; 788 if (aa->coreId > bb->coreId) 789 return 1; 790 if (aa->threadId < bb->threadId) 791 return -1; 792 if (aa->threadId > bb->threadId) 793 return 1; 794 return 0; 795 } 796 797 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 798 // an algorithm which cycles through the available os threads, setting 799 // the current thread's affinity mask to that thread, and then retrieves 800 // the Apic Id for each thread context using the cpuid instruction. 801 static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 802 kmp_i18n_id_t *const msg_id) { 803 kmp_cpuid buf; 804 int rc; 805 *address2os = NULL; 806 *msg_id = kmp_i18n_null; 807 808 // Check if cpuid leaf 4 is supported. 
809 __kmp_x86_cpuid(0, 0, &buf); 810 if (buf.eax < 4) { 811 *msg_id = kmp_i18n_str_NoLeaf4Support; 812 return -1; 813 } 814 815 // The algorithm used starts by setting the affinity to each available thread 816 // and retrieving info from the cpuid instruction, so if we are not capable of 817 // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we 818 // need to do something else - use the defaults that we calculated from 819 // issuing cpuid without binding to each proc. 820 if (!KMP_AFFINITY_CAPABLE()) { 821 // Hack to try and infer the machine topology using only the data 822 // available from cpuid on the current thread, and __kmp_xproc. 823 KMP_ASSERT(__kmp_affinity_type == affinity_none); 824 825 // Get an upper bound on the number of threads per package using cpuid(1). 826 // On some OS/chps combinations where HT is supported by the chip but is 827 // disabled, this value will be 2 on a single core chip. Usually, it will be 828 // 2 if HT is enabled and 1 if HT is disabled. 829 __kmp_x86_cpuid(1, 0, &buf); 830 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 831 if (maxThreadsPerPkg == 0) { 832 maxThreadsPerPkg = 1; 833 } 834 835 // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded 836 // value. 837 // 838 // The author of cpu_count.cpp treated this only an upper bound on the 839 // number of cores, but I haven't seen any cases where it was greater than 840 // the actual number of cores, so we will treat it as exact in this block of 841 // code. 842 // 843 // First, we need to check if cpuid(4) is supported on this chip. To see if 844 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or 845 // greater. 846 __kmp_x86_cpuid(0, 0, &buf); 847 if (buf.eax >= 4) { 848 __kmp_x86_cpuid(4, 0, &buf); 849 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 850 } else { 851 nCoresPerPkg = 1; 852 } 853 854 // There is no way to reliably tell if HT is enabled without issuing the 855 // cpuid instruction from every thread, can correlating the cpuid info, so 856 // if the machine is not affinity capable, we assume that HT is off. We have 857 // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine 858 // does not support HT. 859 // 860 // - Older OSes are usually found on machines with older chips, which do not 861 // support HT. 862 // - The performance penalty for mistakenly identifying a machine as HT when 863 // it isn't (which results in blocktime being incorrecly set to 0) is 864 // greater than the penalty when for mistakenly identifying a machine as 865 // being 1 thread/core when it is really HT enabled (which results in 866 // blocktime being incorrectly set to a positive value). 867 __kmp_ncores = __kmp_xproc; 868 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 869 __kmp_nThreadsPerCore = 1; 870 if (__kmp_affinity_verbose) { 871 KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY"); 872 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 873 if (__kmp_affinity_uniform_topology()) { 874 KMP_INFORM(Uniform, "KMP_AFFINITY"); 875 } else { 876 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 877 } 878 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 879 __kmp_nThreadsPerCore, __kmp_ncores); 880 } 881 return 0; 882 } 883 884 // From here on, we can assume that it is safe to call 885 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 886 // __kmp_affinity_type = affinity_none. 887 888 // Save the affinity mask for the current thread. 
889 kmp_affin_mask_t *oldMask; 890 KMP_CPU_ALLOC(oldMask); 891 KMP_ASSERT(oldMask != NULL); 892 __kmp_get_system_affinity(oldMask, TRUE); 893 894 // Run through each of the available contexts, binding the current thread 895 // to it, and obtaining the pertinent information using the cpuid instr. 896 // 897 // The relevant information is: 898 // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context 899 // has a uniqie Apic Id, which is of the form pkg# : core# : thread#. 900 // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value 901 // of this field determines the width of the core# + thread# fields in the 902 // Apic Id. It is also an upper bound on the number of threads per 903 // package, but it has been verified that situations happen were it is not 904 // exact. In particular, on certain OS/chip combinations where Intel(R) 905 // Hyper-Threading Technology is supported by the chip but has been 906 // disabled, the value of this field will be 2 (for a single core chip). 907 // On other OS/chip combinations supporting Intel(R) Hyper-Threading 908 // Technology, the value of this field will be 1 when Intel(R) 909 // Hyper-Threading Technology is disabled and 2 when it is enabled. 910 // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value 911 // of this field (+1) determines the width of the core# field in the Apic 912 // Id. The comments in "cpucount.cpp" say that this value is an upper 913 // bound, but the IA-32 architecture manual says that it is exactly the 914 // number of cores per package, and I haven't seen any case where it 915 // wasn't. 916 // 917 // From this information, deduce the package Id, core Id, and thread Id, 918 // and set the corresponding fields in the apicThreadInfo struct. 919 unsigned i; 920 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate( 921 __kmp_avail_proc * sizeof(apicThreadInfo)); 922 unsigned nApics = 0; 923 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 924 // Skip this proc if it is not included in the machine model. 925 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 926 continue; 927 } 928 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 929 930 __kmp_affinity_dispatch->bind_thread(i); 931 threadInfo[nApics].osId = i; 932 933 // The apic id and max threads per pkg come from cpuid(1). 934 __kmp_x86_cpuid(1, 0, &buf); 935 if (((buf.edx >> 9) & 1) == 0) { 936 __kmp_set_system_affinity(oldMask, TRUE); 937 __kmp_free(threadInfo); 938 KMP_CPU_FREE(oldMask); 939 *msg_id = kmp_i18n_str_ApicNotPresent; 940 return -1; 941 } 942 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 943 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 944 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 945 threadInfo[nApics].maxThreadsPerPkg = 1; 946 } 947 948 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded 949 // value. 950 // 951 // First, we need to check if cpuid(4) is supported on this chip. To see if 952 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n 953 // or greater. 954 __kmp_x86_cpuid(0, 0, &buf); 955 if (buf.eax >= 4) { 956 __kmp_x86_cpuid(4, 0, &buf); 957 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 958 } else { 959 threadInfo[nApics].maxCoresPerPkg = 1; 960 } 961 962 // Infer the pkgId / coreId / threadId using only the info obtained locally. 
963 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg); 964 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 965 966 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg); 967 int widthT = widthCT - widthC; 968 if (widthT < 0) { 969 // I've never seen this one happen, but I suppose it could, if the cpuid 970 // instruction on a chip was really screwed up. Make sure to restore the 971 // affinity mask before the tail call. 972 __kmp_set_system_affinity(oldMask, TRUE); 973 __kmp_free(threadInfo); 974 KMP_CPU_FREE(oldMask); 975 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 976 return -1; 977 } 978 979 int maskC = (1 << widthC) - 1; 980 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC; 981 982 int maskT = (1 << widthT) - 1; 983 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT; 984 985 nApics++; 986 } 987 988 // We've collected all the info we need. 989 // Restore the old affinity mask for this thread. 990 __kmp_set_system_affinity(oldMask, TRUE); 991 992 // If there's only one thread context to bind to, form an Address object 993 // with depth 1 and return immediately (or, if affinity is off, set 994 // address2os to NULL and return). 995 // 996 // If it is configured to omit the package level when there is only a single 997 // package, the logic at the end of this routine won't work if there is only 998 // a single thread - it would try to form an Address object with depth 0. 999 KMP_ASSERT(nApics > 0); 1000 if (nApics == 1) { 1001 __kmp_ncores = nPackages = 1; 1002 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1003 if (__kmp_affinity_verbose) { 1004 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1005 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1006 1007 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1008 if (__kmp_affinity_respect_mask) { 1009 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1010 } else { 1011 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1012 } 1013 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1014 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1015 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1016 __kmp_nThreadsPerCore, __kmp_ncores); 1017 } 1018 1019 if (__kmp_affinity_type == affinity_none) { 1020 __kmp_free(threadInfo); 1021 KMP_CPU_FREE(oldMask); 1022 return 0; 1023 } 1024 1025 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 1026 Address addr(1); 1027 addr.labels[0] = threadInfo[0].pkgId; 1028 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1029 1030 if (__kmp_affinity_gran_levels < 0) { 1031 __kmp_affinity_gran_levels = 0; 1032 } 1033 1034 if (__kmp_affinity_verbose) { 1035 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1036 } 1037 1038 __kmp_free(threadInfo); 1039 KMP_CPU_FREE(oldMask); 1040 return 1; 1041 } 1042 1043 // Sort the threadInfo table by physical Id. 1044 qsort(threadInfo, nApics, sizeof(*threadInfo), 1045 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1046 1047 // The table is now sorted by pkgId / coreId / threadId, but we really don't 1048 // know the radix of any of the fields. pkgId's may be sparsely assigned among 1049 // the chips on a system. Although coreId's are usually assigned 1050 // [0 .. coresPerPkg-1] and threadId's are usually assigned 1051 // [0..threadsPerCore-1], we don't want to make any such assumptions. 
1052 // 1053 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 1054 // total # packages) are at this point - we want to determine that now. We 1055 // only have an upper bound on the first two figures. 1056 // 1057 // We also perform a consistency check at this point: the values returned by 1058 // the cpuid instruction for any thread bound to a given package had better 1059 // return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1060 nPackages = 1; 1061 nCoresPerPkg = 1; 1062 __kmp_nThreadsPerCore = 1; 1063 unsigned nCores = 1; 1064 1065 unsigned pkgCt = 1; // to determine radii 1066 unsigned lastPkgId = threadInfo[0].pkgId; 1067 unsigned coreCt = 1; 1068 unsigned lastCoreId = threadInfo[0].coreId; 1069 unsigned threadCt = 1; 1070 unsigned lastThreadId = threadInfo[0].threadId; 1071 1072 // intra-pkg consist checks 1073 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1074 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1075 1076 for (i = 1; i < nApics; i++) { 1077 if (threadInfo[i].pkgId != lastPkgId) { 1078 nCores++; 1079 pkgCt++; 1080 lastPkgId = threadInfo[i].pkgId; 1081 if ((int)coreCt > nCoresPerPkg) 1082 nCoresPerPkg = coreCt; 1083 coreCt = 1; 1084 lastCoreId = threadInfo[i].coreId; 1085 if ((int)threadCt > __kmp_nThreadsPerCore) 1086 __kmp_nThreadsPerCore = threadCt; 1087 threadCt = 1; 1088 lastThreadId = threadInfo[i].threadId; 1089 1090 // This is a different package, so go on to the next iteration without 1091 // doing any consistency checks. Reset the consistency check vars, though. 1092 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1093 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1094 continue; 1095 } 1096 1097 if (threadInfo[i].coreId != lastCoreId) { 1098 nCores++; 1099 coreCt++; 1100 lastCoreId = threadInfo[i].coreId; 1101 if ((int)threadCt > __kmp_nThreadsPerCore) 1102 __kmp_nThreadsPerCore = threadCt; 1103 threadCt = 1; 1104 lastThreadId = threadInfo[i].threadId; 1105 } else if (threadInfo[i].threadId != lastThreadId) { 1106 threadCt++; 1107 lastThreadId = threadInfo[i].threadId; 1108 } else { 1109 __kmp_free(threadInfo); 1110 KMP_CPU_FREE(oldMask); 1111 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1112 return -1; 1113 } 1114 1115 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1116 // fields agree between all the threads bounds to a given package. 1117 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || 1118 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1119 __kmp_free(threadInfo); 1120 KMP_CPU_FREE(oldMask); 1121 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1122 return -1; 1123 } 1124 } 1125 nPackages = pkgCt; 1126 if ((int)coreCt > nCoresPerPkg) 1127 nCoresPerPkg = coreCt; 1128 if ((int)threadCt > __kmp_nThreadsPerCore) 1129 __kmp_nThreadsPerCore = threadCt; 1130 1131 // When affinity is off, this routine will still be called to set 1132 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1133 // Make sure all these vars are set correctly, and return now if affinity is 1134 // not enabled. 
1135 __kmp_ncores = nCores; 1136 if (__kmp_affinity_verbose) { 1137 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1138 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1139 1140 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1141 if (__kmp_affinity_respect_mask) { 1142 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1143 } else { 1144 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1145 } 1146 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1147 if (__kmp_affinity_uniform_topology()) { 1148 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1149 } else { 1150 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1151 } 1152 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1153 __kmp_nThreadsPerCore, __kmp_ncores); 1154 } 1155 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1156 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1157 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1158 for (i = 0; i < nApics; ++i) { 1159 __kmp_pu_os_idx[i] = threadInfo[i].osId; 1160 } 1161 if (__kmp_affinity_type == affinity_none) { 1162 __kmp_free(threadInfo); 1163 KMP_CPU_FREE(oldMask); 1164 return 0; 1165 } 1166 1167 // Now that we've determined the number of packages, the number of cores per 1168 // package, and the number of threads per core, we can construct the data 1169 // structure that is to be returned. 1170 int pkgLevel = 0; 1171 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1172 int threadLevel = 1173 (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1); 1174 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1175 1176 KMP_ASSERT(depth > 0); 1177 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1178 1179 for (i = 0; i < nApics; ++i) { 1180 Address addr(depth); 1181 unsigned os = threadInfo[i].osId; 1182 int d = 0; 1183 1184 if (pkgLevel >= 0) { 1185 addr.labels[d++] = threadInfo[i].pkgId; 1186 } 1187 if (coreLevel >= 0) { 1188 addr.labels[d++] = threadInfo[i].coreId; 1189 } 1190 if (threadLevel >= 0) { 1191 addr.labels[d++] = threadInfo[i].threadId; 1192 } 1193 (*address2os)[i] = AddrUnsPair(addr, os); 1194 } 1195 1196 if (__kmp_affinity_gran_levels < 0) { 1197 // Set the granularity level based on what levels are modeled in the machine 1198 // topology map. 1199 __kmp_affinity_gran_levels = 0; 1200 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1201 __kmp_affinity_gran_levels++; 1202 } 1203 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1204 __kmp_affinity_gran_levels++; 1205 } 1206 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1207 __kmp_affinity_gran_levels++; 1208 } 1209 } 1210 1211 if (__kmp_affinity_verbose) { 1212 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1213 coreLevel, threadLevel); 1214 } 1215 1216 __kmp_free(threadInfo); 1217 KMP_CPU_FREE(oldMask); 1218 return depth; 1219 } 1220 1221 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1222 // architectures support a newer interface for specifying the x2APIC Ids, 1223 // based on cpuid leaf 11. 1224 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1225 kmp_i18n_id_t *const msg_id) { 1226 kmp_cpuid buf; 1227 *address2os = NULL; 1228 *msg_id = kmp_i18n_null; 1229 1230 // Check to see if cpuid leaf 11 is supported. 
1231 __kmp_x86_cpuid(0, 0, &buf); 1232 if (buf.eax < 11) { 1233 *msg_id = kmp_i18n_str_NoLeaf11Support; 1234 return -1; 1235 } 1236 __kmp_x86_cpuid(11, 0, &buf); 1237 if (buf.ebx == 0) { 1238 *msg_id = kmp_i18n_str_NoLeaf11Support; 1239 return -1; 1240 } 1241 1242 // Find the number of levels in the machine topology. While we're at it, get 1243 // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to 1244 // get more accurate values later by explicitly counting them, but get 1245 // reasonable defaults now, in case we return early. 1246 int level; 1247 int threadLevel = -1; 1248 int coreLevel = -1; 1249 int pkgLevel = -1; 1250 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1251 1252 for (level = 0;; level++) { 1253 if (level > 31) { 1254 // FIXME: Hack for DPD200163180 1255 // 1256 // If level is big then something went wrong -> exiting 1257 // 1258 // There could actually be 32 valid levels in the machine topology, but so 1259 // far, the only machine we have seen which does not exit this loop before 1260 // iteration 32 has fubar x2APIC settings. 1261 // 1262 // For now, just reject this case based upon loop trip count. 1263 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1264 return -1; 1265 } 1266 __kmp_x86_cpuid(11, level, &buf); 1267 if (buf.ebx == 0) { 1268 if (pkgLevel < 0) { 1269 // Will infer nPackages from __kmp_xproc 1270 pkgLevel = level; 1271 level++; 1272 } 1273 break; 1274 } 1275 int kind = (buf.ecx >> 8) & 0xff; 1276 if (kind == 1) { 1277 // SMT level 1278 threadLevel = level; 1279 coreLevel = -1; 1280 pkgLevel = -1; 1281 __kmp_nThreadsPerCore = buf.ebx & 0xffff; 1282 if (__kmp_nThreadsPerCore == 0) { 1283 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1284 return -1; 1285 } 1286 } else if (kind == 2) { 1287 // core level 1288 coreLevel = level; 1289 pkgLevel = -1; 1290 nCoresPerPkg = buf.ebx & 0xffff; 1291 if (nCoresPerPkg == 0) { 1292 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1293 return -1; 1294 } 1295 } else { 1296 if (level <= 0) { 1297 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1298 return -1; 1299 } 1300 if (pkgLevel >= 0) { 1301 continue; 1302 } 1303 pkgLevel = level; 1304 nPackages = buf.ebx & 0xffff; 1305 if (nPackages == 0) { 1306 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1307 return -1; 1308 } 1309 } 1310 } 1311 int depth = level; 1312 1313 // In the above loop, "level" was counted from the finest level (usually 1314 // thread) to the coarsest. The caller expects that we will place the labels 1315 // in (*address2os)[].first.labels[] in the inverse order, so we need to 1316 // invert the vars saying which level means what. 1317 if (threadLevel >= 0) { 1318 threadLevel = depth - threadLevel - 1; 1319 } 1320 if (coreLevel >= 0) { 1321 coreLevel = depth - coreLevel - 1; 1322 } 1323 KMP_DEBUG_ASSERT(pkgLevel >= 0); 1324 pkgLevel = depth - pkgLevel - 1; 1325 1326 // The algorithm used starts by setting the affinity to each available thread 1327 // and retrieving info from the cpuid instruction, so if we are not capable of 1328 // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we 1329 // need to do something else - use the defaults that we calculated from 1330 // issuing cpuid without binding to each proc. 1331 if (!KMP_AFFINITY_CAPABLE()) { 1332 // Hack to try and infer the machine topology using only the data 1333 // available from cpuid on the current thread, and __kmp_xproc. 
1334 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1335 1336 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1337 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1338 if (__kmp_affinity_verbose) { 1339 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1340 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1341 if (__kmp_affinity_uniform_topology()) { 1342 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1343 } else { 1344 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1345 } 1346 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1347 __kmp_nThreadsPerCore, __kmp_ncores); 1348 } 1349 return 0; 1350 } 1351 1352 // From here on, we can assume that it is safe to call 1353 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 1354 // __kmp_affinity_type = affinity_none. 1355 1356 // Save the affinity mask for the current thread. 1357 kmp_affin_mask_t *oldMask; 1358 KMP_CPU_ALLOC(oldMask); 1359 __kmp_get_system_affinity(oldMask, TRUE); 1360 1361 // Allocate the data structure to be returned. 1362 AddrUnsPair *retval = 1363 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1364 1365 // Run through each of the available contexts, binding the current thread 1366 // to it, and obtaining the pertinent information using the cpuid instr. 1367 unsigned int proc; 1368 int nApics = 0; 1369 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1370 // Skip this proc if it is not included in the machine model. 1371 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1372 continue; 1373 } 1374 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1375 1376 __kmp_affinity_dispatch->bind_thread(proc); 1377 1378 // Extract labels for each level in the machine topology map from Apic ID. 1379 Address addr(depth); 1380 int prev_shift = 0; 1381 1382 for (level = 0; level < depth; level++) { 1383 __kmp_x86_cpuid(11, level, &buf); 1384 unsigned apicId = buf.edx; 1385 if (buf.ebx == 0) { 1386 if (level != depth - 1) { 1387 KMP_CPU_FREE(oldMask); 1388 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1389 return -1; 1390 } 1391 addr.labels[depth - level - 1] = apicId >> prev_shift; 1392 level++; 1393 break; 1394 } 1395 int shift = buf.eax & 0x1f; 1396 int mask = (1 << shift) - 1; 1397 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1398 prev_shift = shift; 1399 } 1400 if (level != depth) { 1401 KMP_CPU_FREE(oldMask); 1402 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1403 return -1; 1404 } 1405 1406 retval[nApics] = AddrUnsPair(addr, proc); 1407 nApics++; 1408 } 1409 1410 // We've collected all the info we need. 1411 // Restore the old affinity mask for this thread. 1412 __kmp_set_system_affinity(oldMask, TRUE); 1413 1414 // If there's only one thread context to bind to, return now. 
1415 KMP_ASSERT(nApics > 0); 1416 if (nApics == 1) { 1417 __kmp_ncores = nPackages = 1; 1418 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1419 if (__kmp_affinity_verbose) { 1420 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1421 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1422 1423 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1424 if (__kmp_affinity_respect_mask) { 1425 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1426 } else { 1427 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1428 } 1429 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1430 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1431 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1432 __kmp_nThreadsPerCore, __kmp_ncores); 1433 } 1434 1435 if (__kmp_affinity_type == affinity_none) { 1436 __kmp_free(retval); 1437 KMP_CPU_FREE(oldMask); 1438 return 0; 1439 } 1440 1441 // Form an Address object which only includes the package level. 1442 Address addr(1); 1443 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1444 retval[0].first = addr; 1445 1446 if (__kmp_affinity_gran_levels < 0) { 1447 __kmp_affinity_gran_levels = 0; 1448 } 1449 1450 if (__kmp_affinity_verbose) { 1451 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1452 } 1453 1454 *address2os = retval; 1455 KMP_CPU_FREE(oldMask); 1456 return 1; 1457 } 1458 1459 // Sort the table by physical Id. 1460 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1461 1462 // Find the radix at each of the levels. 1463 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1464 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1465 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1466 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1467 for (level = 0; level < depth; level++) { 1468 totals[level] = 1; 1469 maxCt[level] = 1; 1470 counts[level] = 1; 1471 last[level] = retval[0].first.labels[level]; 1472 } 1473 1474 // From here on, the iteration variable "level" runs from the finest level to 1475 // the coarsest, i.e. we iterate forward through 1476 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1477 // backwards. 1478 for (proc = 1; (int)proc < nApics; proc++) { 1479 int level; 1480 for (level = 0; level < depth; level++) { 1481 if (retval[proc].first.labels[level] != last[level]) { 1482 int j; 1483 for (j = level + 1; j < depth; j++) { 1484 totals[j]++; 1485 counts[j] = 1; 1486 // The line below causes printing incorrect topology information in 1487 // case the max value for some level (maxCt[level]) is encountered 1488 // earlier than some less value while going through the array. For 1489 // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then 1490 // maxCt[1] == 2 1491 // whereas it must be 4. 1492 // TODO!!! 
Check if it can be commented safely 1493 // maxCt[j] = 1; 1494 last[j] = retval[proc].first.labels[j]; 1495 } 1496 totals[level]++; 1497 counts[level]++; 1498 if (counts[level] > maxCt[level]) { 1499 maxCt[level] = counts[level]; 1500 } 1501 last[level] = retval[proc].first.labels[level]; 1502 break; 1503 } else if (level == depth - 1) { 1504 __kmp_free(last); 1505 __kmp_free(maxCt); 1506 __kmp_free(counts); 1507 __kmp_free(totals); 1508 __kmp_free(retval); 1509 KMP_CPU_FREE(oldMask); 1510 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1511 return -1; 1512 } 1513 } 1514 } 1515 1516 // When affinity is off, this routine will still be called to set 1517 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1518 // Make sure all these vars are set correctly, and return if affinity is not 1519 // enabled. 1520 if (threadLevel >= 0) { 1521 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1522 } else { 1523 __kmp_nThreadsPerCore = 1; 1524 } 1525 nPackages = totals[pkgLevel]; 1526 1527 if (coreLevel >= 0) { 1528 __kmp_ncores = totals[coreLevel]; 1529 nCoresPerPkg = maxCt[coreLevel]; 1530 } else { 1531 __kmp_ncores = nPackages; 1532 nCoresPerPkg = 1; 1533 } 1534 1535 // Check to see if the machine topology is uniform 1536 unsigned prod = maxCt[0]; 1537 for (level = 1; level < depth; level++) { 1538 prod *= maxCt[level]; 1539 } 1540 bool uniform = (prod == totals[level - 1]); 1541 1542 // Print the machine topology summary. 1543 if (__kmp_affinity_verbose) { 1544 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1545 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1546 1547 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1548 if (__kmp_affinity_respect_mask) { 1549 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1550 } else { 1551 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1552 } 1553 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1554 if (uniform) { 1555 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1556 } else { 1557 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1558 } 1559 1560 kmp_str_buf_t buf; 1561 __kmp_str_buf_init(&buf); 1562 1563 __kmp_str_buf_print(&buf, "%d", totals[0]); 1564 for (level = 1; level <= pkgLevel; level++) { 1565 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1566 } 1567 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1568 __kmp_nThreadsPerCore, __kmp_ncores); 1569 1570 __kmp_str_buf_free(&buf); 1571 } 1572 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1573 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1574 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1575 for (proc = 0; (int)proc < nApics; ++proc) { 1576 __kmp_pu_os_idx[proc] = retval[proc].second; 1577 } 1578 if (__kmp_affinity_type == affinity_none) { 1579 __kmp_free(last); 1580 __kmp_free(maxCt); 1581 __kmp_free(counts); 1582 __kmp_free(totals); 1583 __kmp_free(retval); 1584 KMP_CPU_FREE(oldMask); 1585 return 0; 1586 } 1587 1588 // Find any levels with radiix 1, and remove them from the map 1589 // (except for the package level). 1590 int new_depth = 0; 1591 for (level = 0; level < depth; level++) { 1592 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1593 continue; 1594 } 1595 new_depth++; 1596 } 1597 1598 // If we are removing any levels, allocate a new vector to return, 1599 // and copy the relevant information to it. 
1600 if (new_depth != depth) { 1601 AddrUnsPair *new_retval = 1602 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1603 for (proc = 0; (int)proc < nApics; proc++) { 1604 Address addr(new_depth); 1605 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1606 } 1607 int new_level = 0; 1608 int newPkgLevel = -1; 1609 int newCoreLevel = -1; 1610 int newThreadLevel = -1; 1611 int i; 1612 for (level = 0; level < depth; level++) { 1613 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1614 // Remove this level. Never remove the package level 1615 continue; 1616 } 1617 if (level == pkgLevel) { 1618 newPkgLevel = level; 1619 } 1620 if (level == coreLevel) { 1621 newCoreLevel = level; 1622 } 1623 if (level == threadLevel) { 1624 newThreadLevel = level; 1625 } 1626 for (proc = 0; (int)proc < nApics; proc++) { 1627 new_retval[proc].first.labels[new_level] = 1628 retval[proc].first.labels[level]; 1629 } 1630 new_level++; 1631 } 1632 1633 __kmp_free(retval); 1634 retval = new_retval; 1635 depth = new_depth; 1636 pkgLevel = newPkgLevel; 1637 coreLevel = newCoreLevel; 1638 threadLevel = newThreadLevel; 1639 } 1640 1641 if (__kmp_affinity_gran_levels < 0) { 1642 // Set the granularity level based on what levels are modeled 1643 // in the machine topology map. 1644 __kmp_affinity_gran_levels = 0; 1645 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1646 __kmp_affinity_gran_levels++; 1647 } 1648 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1649 __kmp_affinity_gran_levels++; 1650 } 1651 if (__kmp_affinity_gran > affinity_gran_package) { 1652 __kmp_affinity_gran_levels++; 1653 } 1654 } 1655 1656 if (__kmp_affinity_verbose) { 1657 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, 1658 threadLevel); 1659 } 1660 1661 __kmp_free(last); 1662 __kmp_free(maxCt); 1663 __kmp_free(counts); 1664 __kmp_free(totals); 1665 KMP_CPU_FREE(oldMask); 1666 *address2os = retval; 1667 return depth; 1668 } 1669 1670 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1671 1672 #define osIdIndex 0 1673 #define threadIdIndex 1 1674 #define coreIdIndex 2 1675 #define pkgIdIndex 3 1676 #define nodeIdIndex 4 1677 1678 typedef unsigned *ProcCpuInfo; 1679 static unsigned maxIndex = pkgIdIndex; 1680 1681 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) { 1682 const unsigned *aa = (const unsigned *)a; 1683 const unsigned *bb = (const unsigned *)b; 1684 if (aa[osIdIndex] < bb[osIdIndex]) 1685 return -1; 1686 if (aa[osIdIndex] > bb[osIdIndex]) 1687 return 1; 1688 return 0; 1689 }; 1690 1691 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 1692 const void *b) { 1693 unsigned i; 1694 const unsigned *aa = *(unsigned *const *)a; 1695 const unsigned *bb = *(unsigned *const *)b; 1696 for (i = maxIndex;; i--) { 1697 if (aa[i] < bb[i]) 1698 return -1; 1699 if (aa[i] > bb[i]) 1700 return 1; 1701 if (i == osIdIndex) 1702 break; 1703 } 1704 return 0; 1705 } 1706 1707 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1708 // affinity map. 1709 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 1710 int *line, 1711 kmp_i18n_id_t *const msg_id, 1712 FILE *f) { 1713 *address2os = NULL; 1714 *msg_id = kmp_i18n_null; 1715 1716 // Scan of the file, and count the number of "processor" (osId) fields, 1717 // and find the highest value of <n> for a node_<n> field. 
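// As an illustration (hypothetical record; only the fields recognized below
// are shown), one record in that format might look like
//   processor   : 12
//   physical id : 1
//   core id     : 4
// with records separated by blank lines; this first pass only counts the
// records and determines how many fields each one needs.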
1718 char buf[256]; 1719 unsigned num_records = 0; 1720 while (!feof(f)) { 1721 buf[sizeof(buf) - 1] = 1; 1722 if (!fgets(buf, sizeof(buf), f)) { 1723 // Read errors presumably because of EOF 1724 break; 1725 } 1726 1727 char s1[] = "processor"; 1728 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1729 num_records++; 1730 continue; 1731 } 1732 1733 // FIXME - this will match "node_<n> <garbage>" 1734 unsigned level; 1735 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 1736 if (nodeIdIndex + level >= maxIndex) { 1737 maxIndex = nodeIdIndex + level; 1738 } 1739 continue; 1740 } 1741 } 1742 1743 // Check for empty file / no valid processor records, or too many. The number 1744 // of records can't exceed the number of valid bits in the affinity mask. 1745 if (num_records == 0) { 1746 *line = 0; 1747 *msg_id = kmp_i18n_str_NoProcRecords; 1748 return -1; 1749 } 1750 if (num_records > (unsigned)__kmp_xproc) { 1751 *line = 0; 1752 *msg_id = kmp_i18n_str_TooManyProcRecords; 1753 return -1; 1754 } 1755 1756 // Set the file pointer back to the beginning, so that we can scan the file 1757 // again, this time performing a full parse of the data. Allocate a vector of 1758 // ProcCpuInfo objects, where we will place the data. Adding an extra element 1759 // at the end allows us to remove a lot of extra checks for termination 1760 // conditions. 1761 if (fseek(f, 0, SEEK_SET) != 0) { 1762 *line = 0; 1763 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1764 return -1; 1765 } 1766 1767 // Allocate the array of records to store the proc info in. The dummy 1768 // element at the end makes the logic in filling them out easier to code. 1769 unsigned **threadInfo = 1770 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 1771 unsigned i; 1772 for (i = 0; i <= num_records; i++) { 1773 threadInfo[i] = 1774 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 1775 } 1776 1777 #define CLEANUP_THREAD_INFO \ 1778 for (i = 0; i <= num_records; i++) { \ 1779 __kmp_free(threadInfo[i]); \ 1780 } \ 1781 __kmp_free(threadInfo); 1782 1783 // A value of UINT_MAX means that we didn't find the field 1784 unsigned __index; 1785 1786 #define INIT_PROC_INFO(p) \ 1787 for (__index = 0; __index <= maxIndex; __index++) { \ 1788 (p)[__index] = UINT_MAX; \ 1789 } 1790 1791 for (i = 0; i <= num_records; i++) { 1792 INIT_PROC_INFO(threadInfo[i]); 1793 } 1794 1795 unsigned num_avail = 0; 1796 *line = 0; 1797 while (!feof(f)) { 1798 // Create an inner scoping level, so that all the goto targets at the end of 1799 // the loop appear in an outer scoping level. This avoids warnings about 1800 // jumping past an initialization to a target in the same block. 1801 { 1802 buf[sizeof(buf) - 1] = 1; 1803 bool long_line = false; 1804 if (!fgets(buf, sizeof(buf), f)) { 1805 // Read errors presumably because of EOF 1806 // If there is valid data in threadInfo[num_avail], then fake 1807 // a blank line to ensure that the last address gets parsed. 1808 bool valid = false; 1809 for (i = 0; i <= maxIndex; i++) { 1810 if (threadInfo[num_avail][i] != UINT_MAX) { 1811 valid = true; 1812 } 1813 } 1814 if (!valid) { 1815 break; 1816 } 1817 buf[0] = 0; 1818 } else if (!buf[sizeof(buf) - 1]) { 1819 // The line is longer than the buffer. Set a flag and don't 1820 // emit an error if we were going to ignore the line, anyway.
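// (The sentinel byte stored at buf[sizeof(buf) - 1] before each fgets() call
// is overwritten by the terminating '\0' only when a line fills the entire
// buffer, which is how an over-long line is detected here.)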
1821 long_line = true; 1822 1823 #define CHECK_LINE \ 1824 if (long_line) { \ 1825 CLEANUP_THREAD_INFO; \ 1826 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1827 return -1; \ 1828 } 1829 } 1830 (*line)++; 1831 1832 char s1[] = "processor"; 1833 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1834 CHECK_LINE; 1835 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1836 unsigned val; 1837 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1838 goto no_val; 1839 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 1840 goto dup_field; 1841 threadInfo[num_avail][osIdIndex] = val; 1842 #if KMP_OS_LINUX && USE_SYSFS_INFO 1843 char path[256]; 1844 KMP_SNPRINTF( 1845 path, sizeof(path), 1846 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1847 threadInfo[num_avail][osIdIndex]); 1848 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1849 1850 KMP_SNPRINTF(path, sizeof(path), 1851 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1852 threadInfo[num_avail][osIdIndex]); 1853 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1854 continue; 1855 #else 1856 } 1857 char s2[] = "physical id"; 1858 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1859 CHECK_LINE; 1860 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1861 unsigned val; 1862 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1863 goto no_val; 1864 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 1865 goto dup_field; 1866 threadInfo[num_avail][pkgIdIndex] = val; 1867 continue; 1868 } 1869 char s3[] = "core id"; 1870 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1871 CHECK_LINE; 1872 char *p = strchr(buf + sizeof(s3) - 1, ':'); 1873 unsigned val; 1874 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1875 goto no_val; 1876 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 1877 goto dup_field; 1878 threadInfo[num_avail][coreIdIndex] = val; 1879 continue; 1880 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 1881 } 1882 char s4[] = "thread id"; 1883 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 1884 CHECK_LINE; 1885 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1886 unsigned val; 1887 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1888 goto no_val; 1889 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 1890 goto dup_field; 1891 threadInfo[num_avail][threadIdIndex] = val; 1892 continue; 1893 } 1894 unsigned level; 1895 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 1896 CHECK_LINE; 1897 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1898 unsigned val; 1899 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1900 goto no_val; 1901 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 1902 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 1903 goto dup_field; 1904 threadInfo[num_avail][nodeIdIndex + level] = val; 1905 continue; 1906 } 1907 1908 // We didn't recognize the leading token on the line. There are lots of 1909 // leading tokens that we don't recognize - if the line isn't empty, go on 1910 // to the next line. 1911 if ((*buf != 0) && (*buf != '\n')) { 1912 // If the line is longer than the buffer, read characters 1913 // until we find a newline. 1914 if (long_line) { 1915 int ch; 1916 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 1917 ; 1918 } 1919 continue; 1920 } 1921 1922 // A newline has signalled the end of the processor record. 1923 // Check that there aren't too many procs specified. 1924 if ((int)num_avail == __kmp_xproc) { 1925 CLEANUP_THREAD_INFO; 1926 *msg_id = kmp_i18n_str_TooManyEntries; 1927 return -1; 1928 } 1929 1930 // Check for missing fields. 
The osId field must be there, and we 1931 // currently require that the physical id field is specified, also. 1932 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 1933 CLEANUP_THREAD_INFO; 1934 *msg_id = kmp_i18n_str_MissingProcField; 1935 return -1; 1936 } 1937 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 1938 CLEANUP_THREAD_INFO; 1939 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 1940 return -1; 1941 } 1942 1943 // Skip this proc if it is not included in the machine model. 1944 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 1945 __kmp_affin_fullMask)) { 1946 INIT_PROC_INFO(threadInfo[num_avail]); 1947 continue; 1948 } 1949 1950 // We have a successful parse of this proc's info. 1951 // Increment the counter, and prepare for the next proc. 1952 num_avail++; 1953 KMP_ASSERT(num_avail <= num_records); 1954 INIT_PROC_INFO(threadInfo[num_avail]); 1955 } 1956 continue; 1957 1958 no_val: 1959 CLEANUP_THREAD_INFO; 1960 *msg_id = kmp_i18n_str_MissingValCpuinfo; 1961 return -1; 1962 1963 dup_field: 1964 CLEANUP_THREAD_INFO; 1965 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 1966 return -1; 1967 } 1968 *line = 0; 1969 1970 #if KMP_MIC && REDUCE_TEAM_SIZE 1971 unsigned teamSize = 0; 1972 #endif // KMP_MIC && REDUCE_TEAM_SIZE 1973 1974 // check for num_records == __kmp_xproc ??? 1975 1976 // If there's only one thread context to bind to, form an Address object with 1977 // depth 1 and return immediately (or, if affinity is off, set address2os to 1978 // NULL and return). 1979 // 1980 // If it is configured to omit the package level when there is only a single 1981 // package, the logic at the end of this routine won't work if there is only a 1982 // single thread - it would try to form an Address object with depth 0. 1983 KMP_ASSERT(num_avail > 0); 1984 KMP_ASSERT(num_avail <= num_records); 1985 if (num_avail == 1) { 1986 __kmp_ncores = 1; 1987 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1988 if (__kmp_affinity_verbose) { 1989 if (!KMP_AFFINITY_CAPABLE()) { 1990 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 1991 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1992 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1993 } else { 1994 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1995 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 1996 __kmp_affin_fullMask); 1997 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 1998 if (__kmp_affinity_respect_mask) { 1999 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2000 } else { 2001 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2002 } 2003 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2004 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2005 } 2006 int index; 2007 kmp_str_buf_t buf; 2008 __kmp_str_buf_init(&buf); 2009 __kmp_str_buf_print(&buf, "1"); 2010 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2011 __kmp_str_buf_print(&buf, " x 1"); 2012 } 2013 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2014 __kmp_str_buf_free(&buf); 2015 } 2016 2017 if (__kmp_affinity_type == affinity_none) { 2018 CLEANUP_THREAD_INFO; 2019 return 0; 2020 } 2021 2022 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2023 Address addr(1); 2024 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2025 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2026 2027 if (__kmp_affinity_gran_levels < 0) { 2028 __kmp_affinity_gran_levels = 0; 2029 } 2030 2031 if (__kmp_affinity_verbose) { 2032 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2033 } 2034 2035 CLEANUP_THREAD_INFO; 2036 
return 1; 2037 } 2038 2039 // Sort the threadInfo table by physical Id. 2040 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2041 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2042 2043 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2044 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2045 // the chips on a system. Although coreId's are usually assigned 2046 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2047 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2048 // 2049 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2050 // total # packages) are at this point - we want to determine that now. We 2051 // only have an upper bound on the first two figures. 2052 unsigned *counts = 2053 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2054 unsigned *maxCt = 2055 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2056 unsigned *totals = 2057 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2058 unsigned *lastId = 2059 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2060 2061 bool assign_thread_ids = false; 2062 unsigned threadIdCt; 2063 unsigned index; 2064 2065 restart_radix_check: 2066 threadIdCt = 0; 2067 2068 // Initialize the counter arrays with data from threadInfo[0]. 2069 if (assign_thread_ids) { 2070 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2071 threadInfo[0][threadIdIndex] = threadIdCt++; 2072 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2073 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2074 } 2075 } 2076 for (index = 0; index <= maxIndex; index++) { 2077 counts[index] = 1; 2078 maxCt[index] = 1; 2079 totals[index] = 1; 2080 lastId[index] = threadInfo[0][index]; 2081 ; 2082 } 2083 2084 // Run through the rest of the OS procs. 2085 for (i = 1; i < num_avail; i++) { 2086 // Find the most significant index whose id differs from the id for the 2087 // previous OS proc. 2088 for (index = maxIndex; index >= threadIdIndex; index--) { 2089 if (assign_thread_ids && (index == threadIdIndex)) { 2090 // Auto-assign the thread id field if it wasn't specified. 2091 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2092 threadInfo[i][threadIdIndex] = threadIdCt++; 2093 } 2094 // Apparently the thread id field was specified for some entries and not 2095 // others. Start the thread id counter off at the next higher thread id. 2096 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2097 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2098 } 2099 } 2100 if (threadInfo[i][index] != lastId[index]) { 2101 // Run through all indices which are less significant, and reset the 2102 // counts to 1. At all levels up to and including index, we need to 2103 // increment the totals and record the last id. 2104 unsigned index2; 2105 for (index2 = threadIdIndex; index2 < index; index2++) { 2106 totals[index2]++; 2107 if (counts[index2] > maxCt[index2]) { 2108 maxCt[index2] = counts[index2]; 2109 } 2110 counts[index2] = 1; 2111 lastId[index2] = threadInfo[i][index2]; 2112 } 2113 counts[index]++; 2114 totals[index]++; 2115 lastId[index] = threadInfo[i][index]; 2116 2117 if (assign_thread_ids && (index > threadIdIndex)) { 2118 2119 #if KMP_MIC && REDUCE_TEAM_SIZE 2120 // The default team size is the total #threads in the machine 2121 // minus 1 thread for every core that has 3 or more threads. 2122 teamSize += (threadIdCt <= 2) ? 
(threadIdCt) : (threadIdCt - 1); 2123 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2124 2125 // Restart the thread counter, as we are on a new core. 2126 threadIdCt = 0; 2127 2128 // Auto-assign the thread id field if it wasn't specified. 2129 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2130 threadInfo[i][threadIdIndex] = threadIdCt++; 2131 } 2132 2133 // Apparently the thread id field was specified for some entries and 2134 // not others. Start the thread id counter off at the next higher 2135 // thread id. 2136 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2137 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2138 } 2139 } 2140 break; 2141 } 2142 } 2143 if (index < threadIdIndex) { 2144 // If thread ids were specified, it is an error if they are not unique. 2145 // Also, check that we haven't already restarted the loop (to be safe - 2146 // shouldn't need to). 2147 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2148 __kmp_free(lastId); 2149 __kmp_free(totals); 2150 __kmp_free(maxCt); 2151 __kmp_free(counts); 2152 CLEANUP_THREAD_INFO; 2153 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2154 return -1; 2155 } 2156 2157 // If the thread ids were not specified and we see entries that 2158 // are duplicates, start the loop over and assign the thread ids manually. 2159 assign_thread_ids = true; 2160 goto restart_radix_check; 2161 } 2162 } 2163 2164 #if KMP_MIC && REDUCE_TEAM_SIZE 2165 // The default team size is the total #threads in the machine 2166 // minus 1 thread for every core that has 3 or more threads. 2167 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2168 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2169 2170 for (index = threadIdIndex; index <= maxIndex; index++) { 2171 if (counts[index] > maxCt[index]) { 2172 maxCt[index] = counts[index]; 2173 } 2174 } 2175 2176 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2177 nCoresPerPkg = maxCt[coreIdIndex]; 2178 nPackages = totals[pkgIdIndex]; 2179 2180 // Check to see if the machine topology is uniform 2181 unsigned prod = totals[maxIndex]; 2182 for (index = threadIdIndex; index < maxIndex; index++) { 2183 prod *= maxCt[index]; 2184 } 2185 bool uniform = (prod == totals[threadIdIndex]); 2186 2187 // When affinity is off, this routine will still be called to set 2188 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2189 // Make sure all these vars are set correctly, and return now if affinity is 2190 // not enabled.
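// As a worked example of the uniformity check above (hypothetical machine
// with no node_<n> fields, so maxIndex == pkgIdIndex): package 0 with 4 cores
// and package 1 with 2 cores, one thread per core, gives
// prod == 2 * 1 * 4 == 8 but totals[threadIdIndex] == 6, so the topology is
// reported as non-uniform.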
2191 __kmp_ncores = totals[coreIdIndex]; 2192 2193 if (__kmp_affinity_verbose) { 2194 if (!KMP_AFFINITY_CAPABLE()) { 2195 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2196 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2197 if (uniform) { 2198 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2199 } else { 2200 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2201 } 2202 } else { 2203 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2204 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2205 __kmp_affin_fullMask); 2206 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2207 if (__kmp_affinity_respect_mask) { 2208 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2209 } else { 2210 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2211 } 2212 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2213 if (uniform) { 2214 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2215 } else { 2216 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2217 } 2218 } 2219 kmp_str_buf_t buf; 2220 __kmp_str_buf_init(&buf); 2221 2222 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2223 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2224 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2225 } 2226 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2227 maxCt[threadIdIndex], __kmp_ncores); 2228 2229 __kmp_str_buf_free(&buf); 2230 } 2231 2232 #if KMP_MIC && REDUCE_TEAM_SIZE 2233 // Set the default team size. 2234 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2235 __kmp_dflt_team_nth = teamSize; 2236 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2237 "__kmp_dflt_team_nth = %d\n", 2238 __kmp_dflt_team_nth)); 2239 } 2240 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2241 2242 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2243 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2244 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2245 for (i = 0; i < num_avail; ++i) { // fill the os indices 2246 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2247 } 2248 2249 if (__kmp_affinity_type == affinity_none) { 2250 __kmp_free(lastId); 2251 __kmp_free(totals); 2252 __kmp_free(maxCt); 2253 __kmp_free(counts); 2254 CLEANUP_THREAD_INFO; 2255 return 0; 2256 } 2257 2258 // Count the number of levels which have more nodes at that level than at the 2259 // parent's level (with there being an implicit root node of the top level). 2260 // This is equivalent to saying that there is at least one node at this level 2261 // which has a sibling. These levels are in the map, and the package level is 2262 // always in the map. 2263 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2264 int level = 0; 2265 for (index = threadIdIndex; index < maxIndex; index++) { 2266 KMP_ASSERT(totals[index] >= totals[index + 1]); 2267 inMap[index] = (totals[index] > totals[index + 1]); 2268 } 2269 inMap[maxIndex] = (totals[maxIndex] > 1); 2270 inMap[pkgIdIndex] = true; 2271 2272 int depth = 0; 2273 for (index = threadIdIndex; index <= maxIndex; index++) { 2274 if (inMap[index]) { 2275 depth++; 2276 } 2277 } 2278 KMP_ASSERT(depth > 0); 2279 2280 // Construct the data structure that is to be returned. 
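// Each entry pairs an Address whose labels run from the package level down to
// the thread level (keeping only the levels marked in inMap) with the OS proc
// id; e.g. on a depth-3 map a proc might carry labels {1, 4, 0} for OS proc 12
// (values purely illustrative).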
2281 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2282 int pkgLevel = -1; 2283 int coreLevel = -1; 2284 int threadLevel = -1; 2285 2286 for (i = 0; i < num_avail; ++i) { 2287 Address addr(depth); 2288 unsigned os = threadInfo[i][osIdIndex]; 2289 int src_index; 2290 int dst_index = 0; 2291 2292 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2293 if (!inMap[src_index]) { 2294 continue; 2295 } 2296 addr.labels[dst_index] = threadInfo[i][src_index]; 2297 if (src_index == pkgIdIndex) { 2298 pkgLevel = dst_index; 2299 } else if (src_index == coreIdIndex) { 2300 coreLevel = dst_index; 2301 } else if (src_index == threadIdIndex) { 2302 threadLevel = dst_index; 2303 } 2304 dst_index++; 2305 } 2306 (*address2os)[i] = AddrUnsPair(addr, os); 2307 } 2308 2309 if (__kmp_affinity_gran_levels < 0) { 2310 // Set the granularity level based on what levels are modeled 2311 // in the machine topology map. 2312 unsigned src_index; 2313 __kmp_affinity_gran_levels = 0; 2314 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2315 if (!inMap[src_index]) { 2316 continue; 2317 } 2318 switch (src_index) { 2319 case threadIdIndex: 2320 if (__kmp_affinity_gran > affinity_gran_thread) { 2321 __kmp_affinity_gran_levels++; 2322 } 2323 2324 break; 2325 case coreIdIndex: 2326 if (__kmp_affinity_gran > affinity_gran_core) { 2327 __kmp_affinity_gran_levels++; 2328 } 2329 break; 2330 2331 case pkgIdIndex: 2332 if (__kmp_affinity_gran > affinity_gran_package) { 2333 __kmp_affinity_gran_levels++; 2334 } 2335 break; 2336 } 2337 } 2338 } 2339 2340 if (__kmp_affinity_verbose) { 2341 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2342 coreLevel, threadLevel); 2343 } 2344 2345 __kmp_free(inMap); 2346 __kmp_free(lastId); 2347 __kmp_free(totals); 2348 __kmp_free(maxCt); 2349 __kmp_free(counts); 2350 CLEANUP_THREAD_INFO; 2351 return depth; 2352 } 2353 2354 // Create and return a table of affinity masks, indexed by OS thread ID. 2355 // This routine handles OR'ing together all the affinity masks of threads 2356 // that are sufficiently close, if granularity > fine. 2357 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2358 unsigned *numUnique, 2359 AddrUnsPair *address2os, 2360 unsigned numAddrs) { 2361 // First form a table of affinity masks in order of OS thread id. 2362 unsigned depth; 2363 unsigned maxOsId; 2364 unsigned i; 2365 2366 KMP_ASSERT(numAddrs > 0); 2367 depth = address2os[0].first.depth; 2368 2369 maxOsId = 0; 2370 for (i = 0; i < numAddrs; i++) { 2371 unsigned osId = address2os[i].second; 2372 if (osId > maxOsId) { 2373 maxOsId = osId; 2374 } 2375 } 2376 kmp_affin_mask_t *osId2Mask; 2377 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2378 2379 // Sort the address2os table according to physical order. Doing so will put 2380 // all threads on the same core/package/node in consecutive locations. 2381 qsort(address2os, numAddrs, sizeof(*address2os), 2382 __kmp_affinity_cmp_Address_labels); 2383 2384 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2385 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2386 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2387 } 2388 if (__kmp_affinity_gran_levels >= (int)depth) { 2389 if (__kmp_affinity_verbose || 2390 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2391 KMP_WARNING(AffThreadsMayMigrate); 2392 } 2393 } 2394 2395 // Run through the table, forming the masks for all threads on each core. 
2396 // Threads on the same core will have identical "Address" objects, not 2397 // considering the last level, which must be the thread id. All threads on a 2398 // core will appear consecutively. 2399 unsigned unique = 0; 2400 unsigned j = 0; // index of 1st thread on core 2401 unsigned leader = 0; 2402 Address *leaderAddr = &(address2os[0].first); 2403 kmp_affin_mask_t *sum; 2404 KMP_CPU_ALLOC_ON_STACK(sum); 2405 KMP_CPU_ZERO(sum); 2406 KMP_CPU_SET(address2os[0].second, sum); 2407 for (i = 1; i < numAddrs; i++) { 2408 // If this thread is sufficiently close to the leader (within the 2409 // granularity setting), then set the bit for this os thread in the 2410 // affinity mask for this group, and go on to the next thread. 2411 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2412 KMP_CPU_SET(address2os[i].second, sum); 2413 continue; 2414 } 2415 2416 // For every thread in this group, copy the mask to the thread's entry in 2417 // the osId2Mask table. Mark the first address as a leader. 2418 for (; j < i; j++) { 2419 unsigned osId = address2os[j].second; 2420 KMP_DEBUG_ASSERT(osId <= maxOsId); 2421 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2422 KMP_CPU_COPY(mask, sum); 2423 address2os[j].first.leader = (j == leader); 2424 } 2425 unique++; 2426 2427 // Start a new mask. 2428 leader = i; 2429 leaderAddr = &(address2os[i].first); 2430 KMP_CPU_ZERO(sum); 2431 KMP_CPU_SET(address2os[i].second, sum); 2432 } 2433 2434 // For every thread in last group, copy the mask to the thread's 2435 // entry in the osId2Mask table. 2436 for (; j < i; j++) { 2437 unsigned osId = address2os[j].second; 2438 KMP_DEBUG_ASSERT(osId <= maxOsId); 2439 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2440 KMP_CPU_COPY(mask, sum); 2441 address2os[j].first.leader = (j == leader); 2442 } 2443 unique++; 2444 KMP_CPU_FREE_FROM_STACK(sum); 2445 2446 *maxIndex = maxOsId; 2447 *numUnique = unique; 2448 return osId2Mask; 2449 } 2450 2451 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2452 // as file-static than to try and pass them through the calling sequence of 2453 // the recursive-descent OMP_PLACES parser. 2454 static kmp_affin_mask_t *newMasks; 2455 static int numNewMasks; 2456 static int nextNewMask; 2457 2458 #define ADD_MASK(_mask) \ 2459 { \ 2460 if (nextNewMask >= numNewMasks) { \ 2461 int i; \ 2462 numNewMasks *= 2; \ 2463 kmp_affin_mask_t *temp; \ 2464 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2465 for (i = 0; i < numNewMasks / 2; i++) { \ 2466 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2467 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2468 KMP_CPU_COPY(dest, src); \ 2469 } \ 2470 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2471 newMasks = temp; \ 2472 } \ 2473 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2474 nextNewMask++; \ 2475 } 2476 2477 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2478 { \ 2479 if (((_osId) > _maxOsId) || \ 2480 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2481 if (__kmp_affinity_verbose || \ 2482 (__kmp_affinity_warnings && \ 2483 (__kmp_affinity_type != affinity_none))) { \ 2484 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2485 } \ 2486 } else { \ 2487 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2488 } \ 2489 } 2490 2491 // Re-parse the proclist (for the explicit affinity type), and form the list 2492 // of affinity newMasks indexed by gtid. 
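// As an illustration, a proclist such as "0,4,{2,3},8-15:2" would yield the
// masks {0}, {4}, {2,3}, {8}, {10}, {12} and {14}, in that order, assuming all
// of those OS proc ids are present in osId2Mask.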
2493 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2494 unsigned int *out_numMasks, 2495 const char *proclist, 2496 kmp_affin_mask_t *osId2Mask, 2497 int maxOsId) { 2498 int i; 2499 const char *scan = proclist; 2500 const char *next = proclist; 2501 2502 // We use malloc() for the temporary mask vector, so that we can use 2503 // realloc() to extend it. 2504 numNewMasks = 2; 2505 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2506 nextNewMask = 0; 2507 kmp_affin_mask_t *sumMask; 2508 KMP_CPU_ALLOC(sumMask); 2509 int setSize = 0; 2510 2511 for (;;) { 2512 int start, end, stride; 2513 2514 SKIP_WS(scan); 2515 next = scan; 2516 if (*next == '\0') { 2517 break; 2518 } 2519 2520 if (*next == '{') { 2521 int num; 2522 setSize = 0; 2523 next++; // skip '{' 2524 SKIP_WS(next); 2525 scan = next; 2526 2527 // Read the first integer in the set. 2528 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2529 SKIP_DIGITS(next); 2530 num = __kmp_str_to_int(scan, *next); 2531 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2532 2533 // Copy the mask for that osId to the sum (union) mask. 2534 if ((num > maxOsId) || 2535 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2536 if (__kmp_affinity_verbose || 2537 (__kmp_affinity_warnings && 2538 (__kmp_affinity_type != affinity_none))) { 2539 KMP_WARNING(AffIgnoreInvalidProcID, num); 2540 } 2541 KMP_CPU_ZERO(sumMask); 2542 } else { 2543 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2544 setSize = 1; 2545 } 2546 2547 for (;;) { 2548 // Check for end of set. 2549 SKIP_WS(next); 2550 if (*next == '}') { 2551 next++; // skip '}' 2552 break; 2553 } 2554 2555 // Skip optional comma. 2556 if (*next == ',') { 2557 next++; 2558 } 2559 SKIP_WS(next); 2560 2561 // Read the next integer in the set. 2562 scan = next; 2563 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2564 2565 SKIP_DIGITS(next); 2566 num = __kmp_str_to_int(scan, *next); 2567 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2568 2569 // Add the mask for that osId to the sum mask. 2570 if ((num > maxOsId) || 2571 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2572 if (__kmp_affinity_verbose || 2573 (__kmp_affinity_warnings && 2574 (__kmp_affinity_type != affinity_none))) { 2575 KMP_WARNING(AffIgnoreInvalidProcID, num); 2576 } 2577 } else { 2578 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2579 setSize++; 2580 } 2581 } 2582 if (setSize > 0) { 2583 ADD_MASK(sumMask); 2584 } 2585 2586 SKIP_WS(next); 2587 if (*next == ',') { 2588 next++; 2589 } 2590 scan = next; 2591 continue; 2592 } 2593 2594 // Read the first integer. 2595 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2596 SKIP_DIGITS(next); 2597 start = __kmp_str_to_int(scan, *next); 2598 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2599 SKIP_WS(next); 2600 2601 // If this isn't a range, then add a mask to the list and go on. 2602 if (*next != '-') { 2603 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2604 2605 // Skip optional comma. 2606 if (*next == ',') { 2607 next++; 2608 } 2609 scan = next; 2610 continue; 2611 } 2612 2613 // This is a range. Skip over the '-' and read in the 2nd int. 
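// (E.g. "3-11:4" selects OS procs 3, 7 and 11; a negative stride such as
// "11-3:-4" walks the same range downward.)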
2614 next++; // skip '-' 2615 SKIP_WS(next); 2616 scan = next; 2617 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2618 SKIP_DIGITS(next); 2619 end = __kmp_str_to_int(scan, *next); 2620 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2621 2622 // Check for a stride parameter 2623 stride = 1; 2624 SKIP_WS(next); 2625 if (*next == ':') { 2626 // A stride is specified. Skip over the ':" and read the 3rd int. 2627 int sign = +1; 2628 next++; // skip ':' 2629 SKIP_WS(next); 2630 scan = next; 2631 if (*next == '-') { 2632 sign = -1; 2633 next++; 2634 SKIP_WS(next); 2635 scan = next; 2636 } 2637 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2638 SKIP_DIGITS(next); 2639 stride = __kmp_str_to_int(scan, *next); 2640 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2641 stride *= sign; 2642 } 2643 2644 // Do some range checks. 2645 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2646 if (stride > 0) { 2647 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2648 } else { 2649 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2650 } 2651 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2652 2653 // Add the mask for each OS proc # to the list. 2654 if (stride > 0) { 2655 do { 2656 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2657 start += stride; 2658 } while (start <= end); 2659 } else { 2660 do { 2661 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2662 start += stride; 2663 } while (start >= end); 2664 } 2665 2666 // Skip optional comma. 2667 SKIP_WS(next); 2668 if (*next == ',') { 2669 next++; 2670 } 2671 scan = next; 2672 } 2673 2674 *out_numMasks = nextNewMask; 2675 if (nextNewMask == 0) { 2676 *out_masks = NULL; 2677 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2678 return; 2679 } 2680 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2681 for (i = 0; i < nextNewMask; i++) { 2682 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2683 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2684 KMP_CPU_COPY(dest, src); 2685 } 2686 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2687 KMP_CPU_FREE(sumMask); 2688 } 2689 2690 #if OMP_40_ENABLED 2691 2692 /*----------------------------------------------------------------------------- 2693 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2694 places. Again, Here is the grammar: 2695 2696 place_list := place 2697 place_list := place , place_list 2698 place := num 2699 place := place : num 2700 place := place : num : signed 2701 place := { subplacelist } 2702 place := ! 
place // (lowest priority) 2703 subplace_list := subplace 2704 subplace_list := subplace , subplace_list 2705 subplace := num 2706 subplace := num : num 2707 subplace := num : num : signed 2708 signed := num 2709 signed := + signed 2710 signed := - signed 2711 -----------------------------------------------------------------------------*/ 2712 2713 static void __kmp_process_subplace_list(const char **scan, 2714 kmp_affin_mask_t *osId2Mask, 2715 int maxOsId, kmp_affin_mask_t *tempMask, 2716 int *setSize) { 2717 const char *next; 2718 2719 for (;;) { 2720 int start, count, stride, i; 2721 2722 // Read in the starting proc id 2723 SKIP_WS(*scan); 2724 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2725 next = *scan; 2726 SKIP_DIGITS(next); 2727 start = __kmp_str_to_int(*scan, *next); 2728 KMP_ASSERT(start >= 0); 2729 *scan = next; 2730 2731 // valid follow sets are ',' ':' and '}' 2732 SKIP_WS(*scan); 2733 if (**scan == '}' || **scan == ',') { 2734 if ((start > maxOsId) || 2735 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2736 if (__kmp_affinity_verbose || 2737 (__kmp_affinity_warnings && 2738 (__kmp_affinity_type != affinity_none))) { 2739 KMP_WARNING(AffIgnoreInvalidProcID, start); 2740 } 2741 } else { 2742 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2743 (*setSize)++; 2744 } 2745 if (**scan == '}') { 2746 break; 2747 } 2748 (*scan)++; // skip ',' 2749 continue; 2750 } 2751 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2752 (*scan)++; // skip ':' 2753 2754 // Read count parameter 2755 SKIP_WS(*scan); 2756 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2757 next = *scan; 2758 SKIP_DIGITS(next); 2759 count = __kmp_str_to_int(*scan, *next); 2760 KMP_ASSERT(count >= 0); 2761 *scan = next; 2762 2763 // valid follow sets are ',' ':' and '}' 2764 SKIP_WS(*scan); 2765 if (**scan == '}' || **scan == ',') { 2766 for (i = 0; i < count; i++) { 2767 if ((start > maxOsId) || 2768 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2769 if (__kmp_affinity_verbose || 2770 (__kmp_affinity_warnings && 2771 (__kmp_affinity_type != affinity_none))) { 2772 KMP_WARNING(AffIgnoreInvalidProcID, start); 2773 } 2774 break; // don't proliferate warnings for large count 2775 } else { 2776 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2777 start++; 2778 (*setSize)++; 2779 } 2780 } 2781 if (**scan == '}') { 2782 break; 2783 } 2784 (*scan)++; // skip ',' 2785 continue; 2786 } 2787 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2788 (*scan)++; // skip ':' 2789 2790 // Read stride parameter 2791 int sign = +1; 2792 for (;;) { 2793 SKIP_WS(*scan); 2794 if (**scan == '+') { 2795 (*scan)++; // skip '+' 2796 continue; 2797 } 2798 if (**scan == '-') { 2799 sign *= -1; 2800 (*scan)++; // skip '-' 2801 continue; 2802 } 2803 break; 2804 } 2805 SKIP_WS(*scan); 2806 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2807 next = *scan; 2808 SKIP_DIGITS(next); 2809 stride = __kmp_str_to_int(*scan, *next); 2810 KMP_ASSERT(stride >= 0); 2811 *scan = next; 2812 stride *= sign; 2813 2814 // valid follow sets are ',' and '}' 2815 SKIP_WS(*scan); 2816 if (**scan == '}' || **scan == ',') { 2817 for (i = 0; i < count; i++) { 2818 if ((start > maxOsId) || 2819 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2820 if (__kmp_affinity_verbose || 2821 (__kmp_affinity_warnings && 2822 (__kmp_affinity_type != affinity_none))) { 2823 KMP_WARNING(AffIgnoreInvalidProcID, start); 2824 } 2825 
break; // don't proliferate warnings for large count 2826 } else { 2827 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2828 start += stride; 2829 (*setSize)++; 2830 } 2831 } 2832 if (**scan == '}') { 2833 break; 2834 } 2835 (*scan)++; // skip ',' 2836 continue; 2837 } 2838 2839 KMP_ASSERT2(0, "bad explicit places list"); 2840 } 2841 } 2842 2843 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 2844 int maxOsId, kmp_affin_mask_t *tempMask, 2845 int *setSize) { 2846 const char *next; 2847 2848 // valid follow sets are '{' '!' and num 2849 SKIP_WS(*scan); 2850 if (**scan == '{') { 2851 (*scan)++; // skip '{' 2852 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 2853 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 2854 (*scan)++; // skip '}' 2855 } else if (**scan == '!') { 2856 (*scan)++; // skip '!' 2857 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 2858 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 2859 } else if ((**scan >= '0') && (**scan <= '9')) { 2860 next = *scan; 2861 SKIP_DIGITS(next); 2862 int num = __kmp_str_to_int(*scan, *next); 2863 KMP_ASSERT(num >= 0); 2864 if ((num > maxOsId) || 2865 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2866 if (__kmp_affinity_verbose || 2867 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2868 KMP_WARNING(AffIgnoreInvalidProcID, num); 2869 } 2870 } else { 2871 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 2872 (*setSize)++; 2873 } 2874 *scan = next; // skip num 2875 } else { 2876 KMP_ASSERT2(0, "bad explicit places list"); 2877 } 2878 } 2879 2880 // static void 2881 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 2882 unsigned int *out_numMasks, 2883 const char *placelist, 2884 kmp_affin_mask_t *osId2Mask, 2885 int maxOsId) { 2886 int i, j, count, stride, sign; 2887 const char *scan = placelist; 2888 const char *next = placelist; 2889 2890 numNewMasks = 2; 2891 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2892 nextNewMask = 0; 2893 2894 // tempMask is modified based on the previous or initial 2895 // place to form the current place 2896 // previousMask contains the previous place 2897 kmp_affin_mask_t *tempMask; 2898 kmp_affin_mask_t *previousMask; 2899 KMP_CPU_ALLOC(tempMask); 2900 KMP_CPU_ZERO(tempMask); 2901 KMP_CPU_ALLOC(previousMask); 2902 KMP_CPU_ZERO(previousMask); 2903 int setSize = 0; 2904 2905 for (;;) { 2906 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 2907 2908 // valid follow sets are ',' ':' and EOL 2909 SKIP_WS(scan); 2910 if (*scan == '\0' || *scan == ',') { 2911 if (setSize > 0) { 2912 ADD_MASK(tempMask); 2913 } 2914 KMP_CPU_ZERO(tempMask); 2915 setSize = 0; 2916 if (*scan == '\0') { 2917 break; 2918 } 2919 scan++; // skip ',' 2920 continue; 2921 } 2922 2923 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2924 scan++; // skip ':' 2925 2926 // Read count parameter 2927 SKIP_WS(scan); 2928 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2929 next = scan; 2930 SKIP_DIGITS(next); 2931 count = __kmp_str_to_int(scan, *next); 2932 KMP_ASSERT(count >= 0); 2933 scan = next; 2934 2935 // valid follow sets are ',' ':' and EOL 2936 SKIP_WS(scan); 2937 if (*scan == '\0' || *scan == ',') { 2938 stride = +1; 2939 } else { 2940 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2941 scan++; // skip ':' 2942 2943 // Read stride parameter 2944 sign = +1; 2945 for (;;) { 2946 SKIP_WS(scan); 2947 if (*scan == '+') { 2948 scan++; // skip 
'+' 2949 continue; 2950 } 2951 if (*scan == '-') { 2952 sign *= -1; 2953 scan++; // skip '-' 2954 continue; 2955 } 2956 break; 2957 } 2958 SKIP_WS(scan); 2959 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2960 next = scan; 2961 SKIP_DIGITS(next); 2962 stride = __kmp_str_to_int(scan, *next); 2963 KMP_DEBUG_ASSERT(stride >= 0); 2964 scan = next; 2965 stride *= sign; 2966 } 2967 2968 // Add places determined by initial_place : count : stride 2969 for (i = 0; i < count; i++) { 2970 if (setSize == 0) { 2971 break; 2972 } 2973 // Add the current place, then build the next place (tempMask) from that 2974 KMP_CPU_COPY(previousMask, tempMask); 2975 ADD_MASK(previousMask); 2976 KMP_CPU_ZERO(tempMask); 2977 setSize = 0; 2978 KMP_CPU_SET_ITERATE(j, previousMask) { 2979 if (!KMP_CPU_ISSET(j, previousMask)) { 2980 continue; 2981 } 2982 if ((j + stride > maxOsId) || (j + stride < 0) || 2983 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 2984 (!KMP_CPU_ISSET(j + stride, 2985 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 2986 if ((__kmp_affinity_verbose || 2987 (__kmp_affinity_warnings && 2988 (__kmp_affinity_type != affinity_none))) && 2989 i < count - 1) { 2990 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 2991 } 2992 continue; 2993 } 2994 KMP_CPU_SET(j + stride, tempMask); 2995 setSize++; 2996 } 2997 } 2998 KMP_CPU_ZERO(tempMask); 2999 setSize = 0; 3000 3001 // valid follow sets are ',' and EOL 3002 SKIP_WS(scan); 3003 if (*scan == '\0') { 3004 break; 3005 } 3006 if (*scan == ',') { 3007 scan++; // skip ',' 3008 continue; 3009 } 3010 3011 KMP_ASSERT2(0, "bad explicit places list"); 3012 } 3013 3014 *out_numMasks = nextNewMask; 3015 if (nextNewMask == 0) { 3016 *out_masks = NULL; 3017 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3018 return; 3019 } 3020 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3021 KMP_CPU_FREE(tempMask); 3022 KMP_CPU_FREE(previousMask); 3023 for (i = 0; i < nextNewMask; i++) { 3024 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3025 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3026 KMP_CPU_COPY(dest, src); 3027 } 3028 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3029 } 3030 3031 #endif /* OMP_40_ENABLED */ 3032 3033 #undef ADD_MASK 3034 #undef ADD_MASK_OSID 3035 3036 #if KMP_USE_HWLOC 3037 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, 3038 hwloc_obj_type_t type, 3039 hwloc_obj_t *f) { 3040 if (!hwloc_compare_types(o->type, type)) { 3041 if (*f == NULL) 3042 *f = o; // output first descendant found 3043 return 1; 3044 } 3045 int sum = 0; 3046 for (unsigned i = 0; i < o->arity; i++) 3047 sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); 3048 return sum; // will be 0 if no one found (as PU arity is 0) 3049 } 3050 3051 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, 3052 hwloc_obj_t o, unsigned depth, 3053 hwloc_obj_t *f) { 3054 if (o->depth == depth) { 3055 if (*f == NULL) 3056 *f = o; // output first descendant found 3057 return 1; 3058 } 3059 int sum = 0; 3060 for (unsigned i = 0; i < o->arity; i++) 3061 sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); 3062 return sum; // will be 0 if no one found (as PU arity is 0) 3063 } 3064 3065 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3066 // skip PUs descendants of the object o 3067 int skipped = 0; 3068 hwloc_obj_t hT = NULL; 3069 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3070 for (int i = 0; i < N; ++i) { 3071 
KMP_DEBUG_ASSERT(hT); 3072 unsigned idx = hT->os_index; 3073 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3074 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3075 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3076 ++skipped; 3077 } 3078 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3079 } 3080 return skipped; // count number of skipped units 3081 } 3082 3083 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3084 // check if obj has PUs present in fullMask 3085 hwloc_obj_t hT = NULL; 3086 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3087 for (int i = 0; i < N; ++i) { 3088 KMP_DEBUG_ASSERT(hT); 3089 unsigned idx = hT->os_index; 3090 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) 3091 return 1; // found PU 3092 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3093 } 3094 return 0; // no PUs found 3095 } 3096 #endif // KMP_USE_HWLOC 3097 3098 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3099 AddrUnsPair *newAddr; 3100 if (__kmp_hws_requested == 0) 3101 goto _exit; // no topology limiting actions requested, exit 3102 #if KMP_USE_HWLOC 3103 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3104 // Number of subobjects calculated dynamically, this works fine for 3105 // any non-uniform topology. 3106 // L2 cache objects are determined by depth, other objects - by type. 3107 hwloc_topology_t tp = __kmp_hwloc_topology; 3108 int nS = 0, nN = 0, nL = 0, nC = 0, 3109 nT = 0; // logical index including skipped 3110 int nCr = 0, nTr = 0; // number of requested units 3111 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters 3112 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) 3113 int L2depth, idx; 3114 3115 // check support of extensions ---------------------------------- 3116 int numa_support = 0, tile_support = 0; 3117 if (__kmp_pu_os_idx) 3118 hT = hwloc_get_pu_obj_by_os_index(tp, 3119 __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3120 else 3121 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3122 if (hT == NULL) { // something's gone wrong 3123 KMP_WARNING(AffHWSubsetUnsupported); 3124 goto _exit; 3125 } 3126 // check NUMA node 3127 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3128 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3129 if (hN != NULL && hN->depth > hS->depth) { 3130 numa_support = 1; // 1 in case socket includes node(s) 3131 } else if (__kmp_hws_node.num > 0) { 3132 // don't support sockets inside NUMA node (no such HW found for testing) 3133 KMP_WARNING(AffHWSubsetUnsupported); 3134 goto _exit; 3135 } 3136 // check L2 cahce, get object by depth because of multiple caches 3137 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3138 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3139 if (hL != NULL && 3140 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { 3141 tile_support = 1; // no sense to count L2 if it includes single core 3142 } else if (__kmp_hws_tile.num > 0) { 3143 if (__kmp_hws_core.num == 0) { 3144 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3145 __kmp_hws_tile.num = 0; 3146 } else { 3147 // L2 and core are both requested, but represent same object 3148 KMP_WARNING(AffHWSubsetInvalid); 3149 goto _exit; 3150 } 3151 } 3152 // end of check of extensions ----------------------------------- 3153 3154 // fill in unset items, validate settings ----------------------- 3155 if (__kmp_hws_socket.num == 0) 3156 
__kmp_hws_socket.num = nPackages; // use all available sockets 3157 if (__kmp_hws_socket.offset >= nPackages) { 3158 KMP_WARNING(AffHWSubsetManySockets); 3159 goto _exit; 3160 } 3161 if (numa_support) { 3162 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3163 &hN); // num nodes in socket 3164 if (__kmp_hws_node.num == 0) 3165 __kmp_hws_node.num = NN; // use all available nodes 3166 if (__kmp_hws_node.offset >= NN) { 3167 KMP_WARNING(AffHWSubsetManyNodes); 3168 goto _exit; 3169 } 3170 if (tile_support) { 3171 // get num tiles in node 3172 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3173 if (__kmp_hws_tile.num == 0) { 3174 __kmp_hws_tile.num = NL + 1; 3175 } // use all available tiles, some node may have more tiles, thus +1 3176 if (__kmp_hws_tile.offset >= NL) { 3177 KMP_WARNING(AffHWSubsetManyTiles); 3178 goto _exit; 3179 } 3180 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3181 &hC); // num cores in tile 3182 if (__kmp_hws_core.num == 0) 3183 __kmp_hws_core.num = NC; // use all available cores 3184 if (__kmp_hws_core.offset >= NC) { 3185 KMP_WARNING(AffHWSubsetManyCores); 3186 goto _exit; 3187 } 3188 } else { // tile_support 3189 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3190 &hC); // num cores in node 3191 if (__kmp_hws_core.num == 0) 3192 __kmp_hws_core.num = NC; // use all available cores 3193 if (__kmp_hws_core.offset >= NC) { 3194 KMP_WARNING(AffHWSubsetManyCores); 3195 goto _exit; 3196 } 3197 } // tile_support 3198 } else { // numa_support 3199 if (tile_support) { 3200 // get num tiles in socket 3201 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3202 if (__kmp_hws_tile.num == 0) 3203 __kmp_hws_tile.num = NL; // use all available tiles 3204 if (__kmp_hws_tile.offset >= NL) { 3205 KMP_WARNING(AffHWSubsetManyTiles); 3206 goto _exit; 3207 } 3208 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3209 &hC); // num cores in tile 3210 if (__kmp_hws_core.num == 0) 3211 __kmp_hws_core.num = NC; // use all available cores 3212 if (__kmp_hws_core.offset >= NC) { 3213 KMP_WARNING(AffHWSubsetManyCores); 3214 goto _exit; 3215 } 3216 } else { // tile_support 3217 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3218 &hC); // num cores in socket 3219 if (__kmp_hws_core.num == 0) 3220 __kmp_hws_core.num = NC; // use all available cores 3221 if (__kmp_hws_core.offset >= NC) { 3222 KMP_WARNING(AffHWSubsetManyCores); 3223 goto _exit; 3224 } 3225 } // tile_support 3226 } 3227 if (__kmp_hws_proc.num == 0) 3228 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3229 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3230 KMP_WARNING(AffHWSubsetManyProcs); 3231 goto _exit; 3232 } 3233 // end of validation -------------------------------------------- 3234 3235 if (pAddr) // pAddr is NULL in case of affinity_none 3236 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3237 __kmp_avail_proc); // max size 3238 // main loop to form HW subset ---------------------------------- 3239 hS = NULL; 3240 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3241 for (int s = 0; s < NP; ++s) { 3242 // Check Socket ----------------------------------------------- 3243 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3244 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3245 continue; // skip socket if all PUs are out of fullMask 3246 ++nS; // only count objects those have PUs in affinity mask 3247 if (nS <= __kmp_hws_socket.offset 
|| 3248 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3249 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3250 continue; // move to next socket 3251 } 3252 nCr = 0; // count number of cores per socket 3253 // socket requested, go down the topology tree 3254 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3255 if (numa_support) { 3256 nN = 0; 3257 hN = NULL; 3258 // num nodes in current socket 3259 int NN = 3260 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3261 for (int n = 0; n < NN; ++n) { 3262 // Check NUMA Node ---------------------------------------- 3263 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3264 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3265 continue; // skip node if all PUs are out of fullMask 3266 } 3267 ++nN; 3268 if (nN <= __kmp_hws_node.offset || 3269 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3270 // skip node as not requested 3271 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node 3272 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3273 continue; // move to next node 3274 } 3275 // node requested, go down the topology tree 3276 if (tile_support) { 3277 nL = 0; 3278 hL = NULL; 3279 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3280 for (int l = 0; l < NL; ++l) { 3281 // Check L2 (tile) ------------------------------------ 3282 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3283 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3284 continue; // skip tile if all PUs are out of fullMask 3285 } 3286 ++nL; 3287 if (nL <= __kmp_hws_tile.offset || 3288 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3289 // skip tile as not requested 3290 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3291 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3292 continue; // move to next tile 3293 } 3294 // tile requested, go down the topology tree 3295 nC = 0; 3296 hC = NULL; 3297 // num cores in current tile 3298 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3299 HWLOC_OBJ_CORE, &hC); 3300 for (int c = 0; c < NC; ++c) { 3301 // Check Core --------------------------------------- 3302 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3303 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3304 continue; // skip core if all PUs are out of fullMask 3305 } 3306 ++nC; 3307 if (nC <= __kmp_hws_core.offset || 3308 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3309 // skip node as not requested 3310 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3311 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3312 continue; // move to next node 3313 } 3314 // core requested, go down to PUs 3315 nT = 0; 3316 nTr = 0; 3317 hT = NULL; 3318 // num procs in current core 3319 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3320 HWLOC_OBJ_PU, &hT); 3321 for (int t = 0; t < NT; ++t) { 3322 // Check PU --------------------------------------- 3323 idx = hT->os_index; 3324 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3325 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3326 continue; // skip PU if not in fullMask 3327 } 3328 ++nT; 3329 if (nT <= __kmp_hws_proc.offset || 3330 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3331 // skip PU 3332 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3333 ++n_old; 3334 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3335 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3336 continue; // move to next node 3337 } 3338 ++nTr; 3339 if (pAddr) // collect requested thread's data 3340 newAddr[n_new] = 
(*pAddr)[n_old]; 3341 ++n_new; 3342 ++n_old; 3343 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3344 } // threads loop 3345 if (nTr > 0) { 3346 ++nCr; // num cores per socket 3347 ++nCo; // total num cores 3348 if (nTr > nTpC) 3349 nTpC = nTr; // calc max threads per core 3350 } 3351 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3352 } // cores loop 3353 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3354 } // tiles loop 3355 } else { // tile_support 3356 // no tiles, check cores 3357 nC = 0; 3358 hC = NULL; 3359 // num cores in current node 3360 int NC = 3361 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3362 for (int c = 0; c < NC; ++c) { 3363 // Check Core --------------------------------------- 3364 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3365 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3366 continue; // skip core if all PUs are out of fullMask 3367 } 3368 ++nC; 3369 if (nC <= __kmp_hws_core.offset || 3370 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3371 // skip node as not requested 3372 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3373 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3374 continue; // move to next node 3375 } 3376 // core requested, go down to PUs 3377 nT = 0; 3378 nTr = 0; 3379 hT = NULL; 3380 int NT = 3381 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3382 for (int t = 0; t < NT; ++t) { 3383 // Check PU --------------------------------------- 3384 idx = hT->os_index; 3385 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3386 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3387 continue; // skip PU if not in fullMask 3388 } 3389 ++nT; 3390 if (nT <= __kmp_hws_proc.offset || 3391 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3392 // skip PU 3393 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3394 ++n_old; 3395 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3396 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3397 continue; // move to next node 3398 } 3399 ++nTr; 3400 if (pAddr) // collect requested thread's data 3401 newAddr[n_new] = (*pAddr)[n_old]; 3402 ++n_new; 3403 ++n_old; 3404 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3405 } // threads loop 3406 if (nTr > 0) { 3407 ++nCr; // num cores per socket 3408 ++nCo; // total num cores 3409 if (nTr > nTpC) 3410 nTpC = nTr; // calc max threads per core 3411 } 3412 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3413 } // cores loop 3414 } // tiles support 3415 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3416 } // nodes loop 3417 } else { // numa_support 3418 // no NUMA support 3419 if (tile_support) { 3420 nL = 0; 3421 hL = NULL; 3422 // num tiles in current socket 3423 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3424 for (int l = 0; l < NL; ++l) { 3425 // Check L2 (tile) ------------------------------------ 3426 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3427 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3428 continue; // skip tile if all PUs are out of fullMask 3429 } 3430 ++nL; 3431 if (nL <= __kmp_hws_tile.offset || 3432 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3433 // skip tile as not requested 3434 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3435 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3436 continue; // move to next tile 3437 } 3438 // tile requested, go down the topology tree 3439 nC = 0; 3440 hC = NULL; 3441 // num cores per tile 3442 int NC = 3443 __kmp_hwloc_count_children_by_type(tp, hL, 
HWLOC_OBJ_CORE, &hC); 3444 for (int c = 0; c < NC; ++c) { 3445 // Check Core --------------------------------------- 3446 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3447 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3448 continue; // skip core if all PUs are out of fullMask 3449 } 3450 ++nC; 3451 if (nC <= __kmp_hws_core.offset || 3452 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3453 // skip node as not requested 3454 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3455 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3456 continue; // move to next node 3457 } 3458 // core requested, go down to PUs 3459 nT = 0; 3460 nTr = 0; 3461 hT = NULL; 3462 // num procs per core 3463 int NT = 3464 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3465 for (int t = 0; t < NT; ++t) { 3466 // Check PU --------------------------------------- 3467 idx = hT->os_index; 3468 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3469 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3470 continue; // skip PU if not in fullMask 3471 } 3472 ++nT; 3473 if (nT <= __kmp_hws_proc.offset || 3474 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3475 // skip PU 3476 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3477 ++n_old; 3478 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3479 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3480 continue; // move to next node 3481 } 3482 ++nTr; 3483 if (pAddr) // collect requested thread's data 3484 newAddr[n_new] = (*pAddr)[n_old]; 3485 ++n_new; 3486 ++n_old; 3487 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3488 } // threads loop 3489 if (nTr > 0) { 3490 ++nCr; // num cores per socket 3491 ++nCo; // total num cores 3492 if (nTr > nTpC) 3493 nTpC = nTr; // calc max threads per core 3494 } 3495 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3496 } // cores loop 3497 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3498 } // tiles loop 3499 } else { // tile_support 3500 // no tiles, check cores 3501 nC = 0; 3502 hC = NULL; 3503 // num cores in socket 3504 int NC = 3505 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3506 for (int c = 0; c < NC; ++c) { 3507 // Check Core ------------------------------------------- 3508 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3509 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3510 continue; // skip core if all PUs are out of fullMask 3511 } 3512 ++nC; 3513 if (nC <= __kmp_hws_core.offset || 3514 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3515 // skip node as not requested 3516 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3517 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3518 continue; // move to next node 3519 } 3520 // core requested, go down to PUs 3521 nT = 0; 3522 nTr = 0; 3523 hT = NULL; 3524 // num procs per core 3525 int NT = 3526 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3527 for (int t = 0; t < NT; ++t) { 3528 // Check PU --------------------------------------- 3529 idx = hT->os_index; 3530 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3531 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3532 continue; // skip PU if not in fullMask 3533 } 3534 ++nT; 3535 if (nT <= __kmp_hws_proc.offset || 3536 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3537 // skip PU 3538 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3539 ++n_old; 3540 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3541 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3542 continue; // move to next node 
3543 } 3544 ++nTr; 3545 if (pAddr) // collect requested thread's data 3546 newAddr[n_new] = (*pAddr)[n_old]; 3547 ++n_new; 3548 ++n_old; 3549 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3550 } // threads loop 3551 if (nTr > 0) { 3552 ++nCr; // num cores per socket 3553 ++nCo; // total num cores 3554 if (nTr > nTpC) 3555 nTpC = nTr; // calc max threads per core 3556 } 3557 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3558 } // cores loop 3559 } // tiles support 3560 } // numa_support 3561 if (nCr > 0) { // found cores? 3562 ++nPkg; // num sockets 3563 if (nCr > nCpP) 3564 nCpP = nCr; // calc max cores per socket 3565 } 3566 } // sockets loop 3567 3568 // check the subset is valid 3569 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3570 KMP_DEBUG_ASSERT(nPkg > 0); 3571 KMP_DEBUG_ASSERT(nCpP > 0); 3572 KMP_DEBUG_ASSERT(nTpC > 0); 3573 KMP_DEBUG_ASSERT(nCo > 0); 3574 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3575 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3576 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3577 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3578 3579 nPackages = nPkg; // correct num sockets 3580 nCoresPerPkg = nCpP; // correct num cores per socket 3581 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3582 __kmp_avail_proc = n_new; // correct num procs 3583 __kmp_ncores = nCo; // correct num cores 3584 // hwloc topology method end 3585 } else 3586 #endif // KMP_USE_HWLOC 3587 { 3588 int n_old = 0, n_new = 0, proc_num = 0; 3589 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3590 KMP_WARNING(AffHWSubsetNoHWLOC); 3591 goto _exit; 3592 } 3593 if (__kmp_hws_socket.num == 0) 3594 __kmp_hws_socket.num = nPackages; // use all available sockets 3595 if (__kmp_hws_core.num == 0) 3596 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3597 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3598 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3599 if (!__kmp_affinity_uniform_topology()) { 3600 KMP_WARNING(AffHWSubsetNonUniform); 3601 goto _exit; // don't support non-uniform topology 3602 } 3603 if (depth > 3) { 3604 KMP_WARNING(AffHWSubsetNonThreeLevel); 3605 goto _exit; // don't support not-3-level topology 3606 } 3607 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3608 KMP_WARNING(AffHWSubsetManySockets); 3609 goto _exit; 3610 } 3611 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 3612 KMP_WARNING(AffHWSubsetManyCores); 3613 goto _exit; 3614 } 3615 // Form the requested subset 3616 if (pAddr) // pAddr is NULL in case of affinity_none 3617 newAddr = (AddrUnsPair *)__kmp_allocate( 3618 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * 3619 __kmp_hws_proc.num); 3620 for (int i = 0; i < nPackages; ++i) { 3621 if (i < __kmp_hws_socket.offset || 3622 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3623 // skip not-requested socket 3624 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3625 if (__kmp_pu_os_idx != NULL) { 3626 // walk through skipped socket 3627 for (int j = 0; j < nCoresPerPkg; ++j) { 3628 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3629 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3630 ++proc_num; 3631 } 3632 } 3633 } 3634 } else { 3635 // walk through requested socket 3636 for (int j = 0; j < nCoresPerPkg; ++j) { 3637 if (j < __kmp_hws_core.offset || 3638 j >= __kmp_hws_core.offset + 3639 __kmp_hws_core.num) { // skip not-requested core 3640 n_old += __kmp_nThreadsPerCore; 3641 if (__kmp_pu_os_idx != NULL) { 3642 for 
(int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3643 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3644 ++proc_num; 3645 } 3646 } 3647 } else { 3648 // walk through requested core 3649 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3650 if (k < __kmp_hws_proc.num) { 3651 if (pAddr) // collect requested thread's data 3652 newAddr[n_new] = (*pAddr)[n_old]; 3653 n_new++; 3654 } else { 3655 if (__kmp_pu_os_idx != NULL) 3656 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3657 } 3658 n_old++; 3659 ++proc_num; 3660 } 3661 } 3662 } 3663 } 3664 } 3665 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3666 KMP_DEBUG_ASSERT(n_new == 3667 __kmp_hws_socket.num * __kmp_hws_core.num * 3668 __kmp_hws_proc.num); 3669 nPackages = __kmp_hws_socket.num; // correct nPackages 3670 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3671 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3672 __kmp_avail_proc = n_new; // correct avail_proc 3673 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3674 } // non-hwloc topology method 3675 if (pAddr) { 3676 __kmp_free(*pAddr); 3677 *pAddr = newAddr; // replace old topology with new one 3678 } 3679 if (__kmp_affinity_verbose) { 3680 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3681 __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, 3682 __kmp_affin_fullMask); 3683 if (__kmp_affinity_respect_mask) { 3684 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3685 } else { 3686 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3687 } 3688 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3689 kmp_str_buf_t buf; 3690 __kmp_str_buf_init(&buf); 3691 __kmp_str_buf_print(&buf, "%d", nPackages); 3692 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3693 __kmp_nThreadsPerCore, __kmp_ncores); 3694 __kmp_str_buf_free(&buf); 3695 } 3696 _exit: 3697 if (__kmp_pu_os_idx != NULL) { 3698 __kmp_free(__kmp_pu_os_idx); 3699 __kmp_pu_os_idx = NULL; 3700 } 3701 } 3702 3703 // This function figures out the deepest level at which there is at least one 3704 // cluster/core with more than one processing unit bound to it. 3705 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3706 int nprocs, int bottom_level) { 3707 int core_level = 0; 3708 3709 for (int i = 0; i < nprocs; i++) { 3710 for (int j = bottom_level; j > 0; j--) { 3711 if (address2os[i].first.labels[j] > 0) { 3712 if (core_level < (j - 1)) { 3713 core_level = j - 1; 3714 } 3715 } 3716 } 3717 } 3718 return core_level; 3719 } 3720 3721 // This function counts number of clusters/cores at given level. 3722 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3723 int nprocs, int bottom_level, 3724 int core_level) { 3725 int ncores = 0; 3726 int i, j; 3727 3728 j = bottom_level; 3729 for (i = 0; i < nprocs; i++) { 3730 for (j = bottom_level; j > core_level; j--) { 3731 if ((i + 1) < nprocs) { 3732 if (address2os[i + 1].first.labels[j] > 0) { 3733 break; 3734 } 3735 } 3736 } 3737 if (j == core_level) { 3738 ncores++; 3739 } 3740 } 3741 if (j > core_level) { 3742 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 3743 // core. May occur when called from __kmp_affinity_find_core(). 3744 ncores++; 3745 } 3746 return ncores; 3747 } 3748 3749 // This function finds to which cluster/core given processing unit is bound. 
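// Illustrative example (hypothetical uniform machine, for orientation only):
// with 2 packages x 2 cores x 2 HW threads, address2os holds 8 entries sorted
// by labels {package, core, thread} and bottom_level == 2. Then
// __kmp_affinity_find_core_level() returns 1 (the core level),
// __kmp_affinity_compute_ncores(..., 8, 2, 1) returns 4,
// __kmp_affinity_find_core(..., /*proc=*/5, 2, 1) returns 2 (package 1,
// core 0), and __kmp_affinity_max_proc_per_core(..., 8, 2, 1) returns 2.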
3750 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, 3751 int bottom_level, int core_level) { 3752 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, 3753 core_level) - 3754 1; 3755 } 3756 3757 // This function finds maximal number of processing units bound to a 3758 // cluster/core at given level. 3759 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 3760 int nprocs, int bottom_level, 3761 int core_level) { 3762 int maxprocpercore = 0; 3763 3764 if (core_level < bottom_level) { 3765 for (int i = 0; i < nprocs; i++) { 3766 int percore = address2os[i].first.labels[core_level + 1] + 1; 3767 3768 if (percore > maxprocpercore) { 3769 maxprocpercore = percore; 3770 } 3771 } 3772 } else { 3773 maxprocpercore = 1; 3774 } 3775 return maxprocpercore; 3776 } 3777 3778 static AddrUnsPair *address2os = NULL; 3779 static int *procarr = NULL; 3780 static int __kmp_aff_depth = 0; 3781 3782 #define KMP_EXIT_AFF_NONE \ 3783 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 3784 KMP_ASSERT(address2os == NULL); \ 3785 __kmp_apply_thread_places(NULL, 0); \ 3786 return; 3787 3788 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 3789 const Address *aa = &(((const AddrUnsPair *)a)->first); 3790 const Address *bb = &(((const AddrUnsPair *)b)->first); 3791 unsigned depth = aa->depth; 3792 unsigned i; 3793 KMP_DEBUG_ASSERT(depth == bb->depth); 3794 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 3795 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 3796 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 3797 int j = depth - i - 1; 3798 if (aa->childNums[j] < bb->childNums[j]) 3799 return -1; 3800 if (aa->childNums[j] > bb->childNums[j]) 3801 return 1; 3802 } 3803 for (; i < depth; i++) { 3804 int j = i - __kmp_affinity_compact; 3805 if (aa->childNums[j] < bb->childNums[j]) 3806 return -1; 3807 if (aa->childNums[j] > bb->childNums[j]) 3808 return 1; 3809 } 3810 return 0; 3811 } 3812 3813 static void __kmp_aux_affinity_initialize(void) { 3814 if (__kmp_affinity_masks != NULL) { 3815 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3816 return; 3817 } 3818 3819 // Create the "full" mask - this defines all of the processors that we 3820 // consider to be in the machine model. If respect is set, then it is the 3821 // initialization thread's affinity mask. Otherwise, it is all processors that 3822 // we know about on the machine. 3823 if (__kmp_affin_fullMask == NULL) { 3824 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3825 } 3826 if (KMP_AFFINITY_CAPABLE()) { 3827 if (__kmp_affinity_respect_mask) { 3828 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3829 3830 // Count the number of available processors. 
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }
    } else {
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
    }
  }

  int depth = -1;
  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or the
    // verbose flag is set, and the nowarnings flag was not set.
    const char *file_name = NULL;
    int line = 0;
#if KMP_USE_HWLOC
    if (depth < 0 &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      }
      if (!__kmp_hwloc_error) {
        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        } else if (depth < 0 && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
      }

      file_name = NULL;
      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }

      if (depth < 0) {
        if (__kmp_affinity_verbose) {
          if (msg_id != kmp_i18n_null) {
            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
                       __kmp_i18n_catgets(msg_id),
                       KMP_I18N_STR(DecodingLegacyAPIC));
          } else {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
                       KMP_I18N_STR(DecodingLegacyAPIC));
          }
        }

        file_name = NULL;
        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        if (msg_id != kmp_i18n_null) {
          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
        } else {
          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
        }
      }

      FILE *f = fopen("/proc/cpuinfo", "r");
      if (f == NULL) {
        msg_id = kmp_i18n_str_CantOpenCpuinfo;
      } else {
        file_name = "/proc/cpuinfo";
        depth =
            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY

    if ((depth < 0) &&
        (__kmp_num_proc_groups > 1)) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
      }

      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
      KMP_ASSERT(depth != 0);
    }

#endif /* KMP_GROUP_AFFINITY */

    if (depth < 0) {
      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
        if (file_name == NULL) {
          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
        } else if (line == 0) {
          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
        } else {
          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
                     __kmp_i18n_catgets(msg_id));
        }
      }
      // FIXME - print msg if msg_id = kmp_i18n_null ???

      file_name = "";
      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }
      KMP_ASSERT(depth > 0);
      KMP_ASSERT(address2os != NULL);
    }
  }

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
    }

    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
    }

    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    const char *filename;
    if (__kmp_cpuinfo_file != NULL) {
      filename = __kmp_cpuinfo_file;
    } else {
      filename = "/proc/cpuinfo";
    }

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
    }

    FILE *f = fopen(filename, "r");
    if (f == NULL) {
      int code = errno;
      if (__kmp_cpuinfo_file != NULL) {
        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
      } else {
        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
                    __kmp_msg_null);
      }
    }
    int line = 0;
    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
    fclose(f);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
    if (__kmp_affinity_type == affinity_none) {
      KMP_ASSERT(depth == 0);
      KMP_EXIT_AFF_NONE;
    }
  }

#if KMP_GROUP_AFFINITY

  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    if
(__kmp_affinity_verbose) { 4070 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4071 } 4072 4073 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4074 KMP_ASSERT(depth != 0); 4075 if (depth < 0) { 4076 KMP_ASSERT(msg_id != kmp_i18n_null); 4077 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4078 } 4079 } 4080 4081 #endif /* KMP_GROUP_AFFINITY */ 4082 4083 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4084 if (__kmp_affinity_verbose) { 4085 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4086 } 4087 4088 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4089 if (depth == 0) { 4090 KMP_EXIT_AFF_NONE; 4091 } 4092 // should not fail 4093 KMP_ASSERT(depth > 0); 4094 KMP_ASSERT(address2os != NULL); 4095 } 4096 4097 #if KMP_USE_HWLOC 4098 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4099 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4100 if (__kmp_affinity_verbose) { 4101 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4102 } 4103 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4104 if (depth == 0) { 4105 KMP_EXIT_AFF_NONE; 4106 } 4107 } 4108 #endif // KMP_USE_HWLOC 4109 4110 if (address2os == NULL) { 4111 if (KMP_AFFINITY_CAPABLE() && 4112 (__kmp_affinity_verbose || 4113 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4114 KMP_WARNING(ErrorInitializeAffinity); 4115 } 4116 __kmp_affinity_type = affinity_none; 4117 KMP_AFFINITY_DISABLE(); 4118 return; 4119 } 4120 4121 __kmp_apply_thread_places(&address2os, depth); 4122 4123 // Create the table of masks, indexed by thread Id. 4124 unsigned maxIndex; 4125 unsigned numUnique; 4126 kmp_affin_mask_t *osId2Mask = 4127 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4128 if (__kmp_affinity_gran_levels == 0) { 4129 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4130 } 4131 4132 // Set the childNums vector in all Address objects. This must be done before 4133 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4134 // account the setting of __kmp_affinity_compact. 4135 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4136 4137 switch (__kmp_affinity_type) { 4138 4139 case affinity_explicit: 4140 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4141 #if OMP_40_ENABLED 4142 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4143 #endif 4144 { 4145 __kmp_affinity_process_proclist( 4146 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4147 __kmp_affinity_proclist, osId2Mask, maxIndex); 4148 } 4149 #if OMP_40_ENABLED 4150 else { 4151 __kmp_affinity_process_placelist( 4152 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4153 __kmp_affinity_proclist, osId2Mask, maxIndex); 4154 } 4155 #endif 4156 if (__kmp_affinity_num_masks == 0) { 4157 if (__kmp_affinity_verbose || 4158 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4159 KMP_WARNING(AffNoValidProcID); 4160 } 4161 __kmp_affinity_type = affinity_none; 4162 return; 4163 } 4164 break; 4165 4166 // The other affinity types rely on sorting the Addresses according to some 4167 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4168 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4169 // to do the sort and create the array of affinity masks. 
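  // Informal sketch of the offset arithmetic used by the affinity_logical and
  // affinity_physical cases below (hypothetical numbers): on a machine with
  // 4 cores x 2 HW threads (__kmp_avail_proc == 8), an offset of 3 is stored
  // as (2 * 3) % 8 == 6, so the starting mask chosen later via
  // (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks is advanced by
  // three cores' worth of hardware threads, assuming fine granularity (one
  // mask per hardware thread).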
4170 4171 case affinity_logical: 4172 __kmp_affinity_compact = 0; 4173 if (__kmp_affinity_offset) { 4174 __kmp_affinity_offset = 4175 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4176 } 4177 goto sortAddresses; 4178 4179 case affinity_physical: 4180 if (__kmp_nThreadsPerCore > 1) { 4181 __kmp_affinity_compact = 1; 4182 if (__kmp_affinity_compact >= depth) { 4183 __kmp_affinity_compact = 0; 4184 } 4185 } else { 4186 __kmp_affinity_compact = 0; 4187 } 4188 if (__kmp_affinity_offset) { 4189 __kmp_affinity_offset = 4190 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4191 } 4192 goto sortAddresses; 4193 4194 case affinity_scatter: 4195 if (__kmp_affinity_compact >= depth) { 4196 __kmp_affinity_compact = 0; 4197 } else { 4198 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4199 } 4200 goto sortAddresses; 4201 4202 case affinity_compact: 4203 if (__kmp_affinity_compact >= depth) { 4204 __kmp_affinity_compact = depth - 1; 4205 } 4206 goto sortAddresses; 4207 4208 case affinity_balanced: 4209 if (depth <= 1) { 4210 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4211 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4212 } 4213 __kmp_affinity_type = affinity_none; 4214 return; 4215 } else if (__kmp_affinity_uniform_topology()) { 4216 break; 4217 } else { // Non-uniform topology 4218 4219 // Save the depth for further usage 4220 __kmp_aff_depth = depth; 4221 4222 int core_level = __kmp_affinity_find_core_level( 4223 address2os, __kmp_avail_proc, depth - 1); 4224 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4225 depth - 1, core_level); 4226 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4227 address2os, __kmp_avail_proc, depth - 1, core_level); 4228 4229 int nproc = ncores * maxprocpercore; 4230 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4231 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4232 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4233 } 4234 __kmp_affinity_type = affinity_none; 4235 return; 4236 } 4237 4238 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4239 for (int i = 0; i < nproc; i++) { 4240 procarr[i] = -1; 4241 } 4242 4243 int lastcore = -1; 4244 int inlastcore = 0; 4245 for (int i = 0; i < __kmp_avail_proc; i++) { 4246 int proc = address2os[i].second; 4247 int core = 4248 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4249 4250 if (core == lastcore) { 4251 inlastcore++; 4252 } else { 4253 inlastcore = 0; 4254 } 4255 lastcore = core; 4256 4257 procarr[core * maxprocpercore + inlastcore] = proc; 4258 } 4259 4260 break; 4261 } 4262 4263 sortAddresses: 4264 // Allocate the gtid->affinity mask table. 4265 if (__kmp_affinity_dups) { 4266 __kmp_affinity_num_masks = __kmp_avail_proc; 4267 } else { 4268 __kmp_affinity_num_masks = numUnique; 4269 } 4270 4271 #if OMP_40_ENABLED 4272 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4273 (__kmp_affinity_num_places > 0) && 4274 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4275 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4276 } 4277 #endif 4278 4279 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4280 4281 // Sort the address2os table according to the current setting of 4282 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
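    // Worked example (hypothetical depth-3 machine, labels = {package, core,
    // thread}): with __kmp_affinity_compact == 0 the comparator key is
    // (package, core, thread), so consecutive masks share a core and package
    // (compact placement); with __kmp_affinity_compact == 2 the key becomes
    // (thread, core, package), so consecutive masks tend to land on different
    // packages (scatter placement).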
    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
    {
      int i;
      unsigned j;
      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
          continue;
        }
        unsigned osId = address2os[i].second;
        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
  machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none
  // in this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
#if OMP_40_ENABLED
  __kmp_affinity_num_places = 0;
#endif
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (address2os != NULL) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  KMPAffinity::destroy_api();
}

void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
4387 kmp_affin_mask_t *mask; 4388 int i; 4389 4390 #if OMP_40_ENABLED 4391 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4392 #endif 4393 { 4394 if ((__kmp_affinity_type == affinity_none) || 4395 (__kmp_affinity_type == affinity_balanced)) { 4396 #if KMP_GROUP_AFFINITY 4397 if (__kmp_num_proc_groups > 1) { 4398 return; 4399 } 4400 #endif 4401 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4402 i = KMP_PLACE_ALL; 4403 mask = __kmp_affin_fullMask; 4404 } else { 4405 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4406 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4407 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4408 } 4409 } 4410 #if OMP_40_ENABLED 4411 else { 4412 if ((!isa_root) || 4413 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4414 #if KMP_GROUP_AFFINITY 4415 if (__kmp_num_proc_groups > 1) { 4416 return; 4417 } 4418 #endif 4419 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4420 i = KMP_PLACE_ALL; 4421 mask = __kmp_affin_fullMask; 4422 } else { 4423 // int i = some hash function or just a counter that doesn't 4424 // always start at 0. Use gtid for now. 4425 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4426 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4427 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4428 } 4429 } 4430 #endif 4431 4432 #if OMP_40_ENABLED 4433 th->th.th_current_place = i; 4434 if (isa_root) { 4435 th->th.th_new_place = i; 4436 th->th.th_first_place = 0; 4437 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4438 } 4439 4440 if (i == KMP_PLACE_ALL) { 4441 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4442 gtid)); 4443 } else { 4444 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4445 gtid, i)); 4446 } 4447 #else 4448 if (i == -1) { 4449 KA_TRACE( 4450 100, 4451 ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4452 gtid)); 4453 } else { 4454 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4455 gtid, i)); 4456 } 4457 #endif /* OMP_40_ENABLED */ 4458 4459 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4460 4461 if (__kmp_affinity_verbose) { 4462 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4463 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4464 th->th.th_affin_mask); 4465 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4466 __kmp_gettid(), gtid, buf); 4467 } 4468 4469 #if KMP_OS_WINDOWS 4470 // On Windows* OS, the process affinity mask might have changed. If the user 4471 // didn't request affinity and this call fails, just continue silently. 4472 // See CQ171393. 4473 if (__kmp_affinity_type == affinity_none) { 4474 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4475 } else 4476 #endif 4477 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4478 } 4479 4480 #if OMP_40_ENABLED 4481 4482 void __kmp_affinity_set_place(int gtid) { 4483 int retval; 4484 4485 if (!KMP_AFFINITY_CAPABLE()) { 4486 return; 4487 } 4488 4489 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4490 4491 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4492 "place = %d)\n", 4493 gtid, th->th.th_new_place, th->th.th_current_place)); 4494 4495 // Check that the new place is within this thread's partition. 
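  // For example (hypothetical numbers): a place partition may wrap around the
  // end of the place list, so with 8 places the partition {6, 7, 0, 1} is
  // stored as th_first_place == 6 and th_last_place == 1; that is why both
  // orderings are accepted by the assertions below.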
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

#endif /* OMP_40_ENABLED */

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(1000, ; {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf(
        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
        buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

#if OMP_40_ENABLED
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
4587 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4588 #endif 4589 4590 return retval; 4591 } 4592 4593 int __kmp_aux_get_affinity(void **mask) { 4594 int gtid; 4595 int retval; 4596 kmp_info_t *th; 4597 4598 if (!KMP_AFFINITY_CAPABLE()) { 4599 return -1; 4600 } 4601 4602 gtid = __kmp_entry_gtid(); 4603 th = __kmp_threads[gtid]; 4604 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4605 4606 KA_TRACE(1000, ; { 4607 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4608 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4609 th->th.th_affin_mask); 4610 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4611 gtid, buf); 4612 }); 4613 4614 if (__kmp_env_consistency_check) { 4615 if ((mask == NULL) || (*mask == NULL)) { 4616 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4617 } 4618 } 4619 4620 #if !KMP_OS_WINDOWS 4621 4622 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4623 KA_TRACE(1000, ; { 4624 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4625 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4626 (kmp_affin_mask_t *)(*mask)); 4627 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4628 gtid, buf); 4629 }); 4630 return retval; 4631 4632 #else 4633 4634 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4635 return 0; 4636 4637 #endif /* KMP_OS_WINDOWS */ 4638 } 4639 4640 int __kmp_aux_get_affinity_max_proc() { 4641 if (!KMP_AFFINITY_CAPABLE()) { 4642 return 0; 4643 } 4644 #if KMP_GROUP_AFFINITY 4645 if (__kmp_num_proc_groups > 1) { 4646 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4647 } 4648 #endif 4649 return __kmp_xproc; 4650 } 4651 4652 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4653 int retval; 4654 4655 if (!KMP_AFFINITY_CAPABLE()) { 4656 return -1; 4657 } 4658 4659 KA_TRACE(1000, ; { 4660 int gtid = __kmp_entry_gtid(); 4661 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4662 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4663 (kmp_affin_mask_t *)(*mask)); 4664 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4665 "affinity mask for thread %d = %s\n", 4666 proc, gtid, buf); 4667 }); 4668 4669 if (__kmp_env_consistency_check) { 4670 if ((mask == NULL) || (*mask == NULL)) { 4671 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4672 } 4673 } 4674 4675 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4676 return -1; 4677 } 4678 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4679 return -2; 4680 } 4681 4682 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4683 return 0; 4684 } 4685 4686 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4687 int retval; 4688 4689 if (!KMP_AFFINITY_CAPABLE()) { 4690 return -1; 4691 } 4692 4693 KA_TRACE(1000, ; { 4694 int gtid = __kmp_entry_gtid(); 4695 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4696 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4697 (kmp_affin_mask_t *)(*mask)); 4698 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4699 "affinity mask for thread %d = %s\n", 4700 proc, gtid, buf); 4701 }); 4702 4703 if (__kmp_env_consistency_check) { 4704 if ((mask == NULL) || (*mask == NULL)) { 4705 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4706 } 4707 } 4708 4709 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4710 return -1; 4711 } 4712 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4713 return -2; 4714 } 4715 4716 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4717 return 0; 4718 } 
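// Illustrative sketch (kept out of the build): the __kmp_aux_*affinity*
// helpers above back the user-level kmp_* affinity API. Assuming the
// declarations exposed through omp.h (kmp_affinity_mask_t,
// kmp_create_affinity_mask, kmp_set_affinity_mask_proc, kmp_set_affinity,
// kmp_destroy_affinity_mask), a typical call sequence looks like this:
#if 0
static void example_pin_calling_thread_to_proc0(void) {
  kmp_affinity_mask_t mask;
  kmp_create_affinity_mask(&mask); // allocate an empty mask
  kmp_set_affinity_mask_proc(0, &mask); // add OS proc 0 to the mask
  if (kmp_set_affinity(&mask) != 0) {
    // Non-zero return: the runtime could not apply the mask (e.g. affinity is
    // not supported/enabled, or the underlying OS call failed).
  }
  kmp_destroy_affinity_mask(&mask);
}
#endif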
4719 4720 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4721 int retval; 4722 4723 if (!KMP_AFFINITY_CAPABLE()) { 4724 return -1; 4725 } 4726 4727 KA_TRACE(1000, ; { 4728 int gtid = __kmp_entry_gtid(); 4729 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4730 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4731 (kmp_affin_mask_t *)(*mask)); 4732 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4733 "affinity mask for thread %d = %s\n", 4734 proc, gtid, buf); 4735 }); 4736 4737 if (__kmp_env_consistency_check) { 4738 if ((mask == NULL) || (*mask == NULL)) { 4739 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4740 } 4741 } 4742 4743 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4744 return -1; 4745 } 4746 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4747 return 0; 4748 } 4749 4750 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4751 } 4752 4753 // Dynamic affinity settings - Affinity balanced 4754 void __kmp_balanced_affinity(int tid, int nthreads) { 4755 bool fine_gran = true; 4756 4757 switch (__kmp_affinity_gran) { 4758 case affinity_gran_fine: 4759 case affinity_gran_thread: 4760 break; 4761 case affinity_gran_core: 4762 if (__kmp_nThreadsPerCore > 1) { 4763 fine_gran = false; 4764 } 4765 break; 4766 case affinity_gran_package: 4767 if (nCoresPerPkg > 1) { 4768 fine_gran = false; 4769 } 4770 break; 4771 default: 4772 fine_gran = false; 4773 } 4774 4775 if (__kmp_affinity_uniform_topology()) { 4776 int coreID; 4777 int threadID; 4778 // Number of hyper threads per core in HT machine 4779 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4780 // Number of cores 4781 int ncores = __kmp_ncores; 4782 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4783 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4784 ncores = nPackages; 4785 } 4786 // How many threads will be bound to each core 4787 int chunk = nthreads / ncores; 4788 // How many cores will have an additional thread bound to it - "big cores" 4789 int big_cores = nthreads % ncores; 4790 // Number of threads on the big cores 4791 int big_nth = (chunk + 1) * big_cores; 4792 if (tid < big_nth) { 4793 coreID = tid / (chunk + 1); 4794 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4795 } else { // tid >= big_nth 4796 coreID = (tid - big_cores) / chunk; 4797 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4798 } 4799 4800 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4801 "Illegal set affinity operation when not capable"); 4802 4803 kmp_affin_mask_t *mask; 4804 KMP_CPU_ALLOC_ON_STACK(mask); 4805 KMP_CPU_ZERO(mask); 4806 4807 if (fine_gran) { 4808 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 4809 KMP_CPU_SET(osID, mask); 4810 } else { 4811 for (int i = 0; i < __kmp_nth_per_core; i++) { 4812 int osID; 4813 osID = address2os[coreID * __kmp_nth_per_core + i].second; 4814 KMP_CPU_SET(osID, mask); 4815 } 4816 } 4817 if (__kmp_affinity_verbose) { 4818 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4819 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4820 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4821 __kmp_gettid(), tid, buf); 4822 } 4823 __kmp_set_system_affinity(mask, TRUE); 4824 KMP_CPU_FREE_FROM_STACK(mask); 4825 } else { // Non-uniform topology 4826 4827 kmp_affin_mask_t *mask; 4828 KMP_CPU_ALLOC_ON_STACK(mask); 4829 KMP_CPU_ZERO(mask); 4830 4831 int core_level = __kmp_affinity_find_core_level( 4832 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 4833 int ncores = 
__kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4834 __kmp_aff_depth - 1, core_level); 4835 int nth_per_core = __kmp_affinity_max_proc_per_core( 4836 address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4837 4838 // For performance gain consider the special case nthreads == 4839 // __kmp_avail_proc 4840 if (nthreads == __kmp_avail_proc) { 4841 if (fine_gran) { 4842 int osID = address2os[tid].second; 4843 KMP_CPU_SET(osID, mask); 4844 } else { 4845 int core = __kmp_affinity_find_core(address2os, tid, 4846 __kmp_aff_depth - 1, core_level); 4847 for (int i = 0; i < __kmp_avail_proc; i++) { 4848 int osID = address2os[i].second; 4849 if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, 4850 core_level) == core) { 4851 KMP_CPU_SET(osID, mask); 4852 } 4853 } 4854 } 4855 } else if (nthreads <= ncores) { 4856 4857 int core = 0; 4858 for (int i = 0; i < ncores; i++) { 4859 // Check if this core from procarr[] is in the mask 4860 int in_mask = 0; 4861 for (int j = 0; j < nth_per_core; j++) { 4862 if (procarr[i * nth_per_core + j] != -1) { 4863 in_mask = 1; 4864 break; 4865 } 4866 } 4867 if (in_mask) { 4868 if (tid == core) { 4869 for (int j = 0; j < nth_per_core; j++) { 4870 int osID = procarr[i * nth_per_core + j]; 4871 if (osID != -1) { 4872 KMP_CPU_SET(osID, mask); 4873 // For fine granularity it is enough to set the first available 4874 // osID for this core 4875 if (fine_gran) { 4876 break; 4877 } 4878 } 4879 } 4880 break; 4881 } else { 4882 core++; 4883 } 4884 } 4885 } 4886 } else { // nthreads > ncores 4887 // Array to save the number of processors at each core 4888 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4889 // Array to save the number of cores with "x" available processors; 4890 int *ncores_with_x_procs = 4891 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4892 // Array to save the number of cores with # procs from x to nth_per_core 4893 int *ncores_with_x_to_max_procs = 4894 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4895 4896 for (int i = 0; i <= nth_per_core; i++) { 4897 ncores_with_x_procs[i] = 0; 4898 ncores_with_x_to_max_procs[i] = 0; 4899 } 4900 4901 for (int i = 0; i < ncores; i++) { 4902 int cnt = 0; 4903 for (int j = 0; j < nth_per_core; j++) { 4904 if (procarr[i * nth_per_core + j] != -1) { 4905 cnt++; 4906 } 4907 } 4908 nproc_at_core[i] = cnt; 4909 ncores_with_x_procs[cnt]++; 4910 } 4911 4912 for (int i = 0; i <= nth_per_core; i++) { 4913 for (int j = i; j <= nth_per_core; j++) { 4914 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4915 } 4916 } 4917 4918 // Max number of processors 4919 int nproc = nth_per_core * ncores; 4920 // An array to keep number of threads per each context 4921 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4922 for (int i = 0; i < nproc; i++) { 4923 newarr[i] = 0; 4924 } 4925 4926 int nth = nthreads; 4927 int flag = 0; 4928 while (nth > 0) { 4929 for (int j = 1; j <= nth_per_core; j++) { 4930 int cnt = ncores_with_x_to_max_procs[j]; 4931 for (int i = 0; i < ncores; i++) { 4932 // Skip the core with 0 processors 4933 if (nproc_at_core[i] == 0) { 4934 continue; 4935 } 4936 for (int k = 0; k < nth_per_core; k++) { 4937 if (procarr[i * nth_per_core + k] != -1) { 4938 if (newarr[i * nth_per_core + k] == 0) { 4939 newarr[i * nth_per_core + k] = 1; 4940 cnt--; 4941 nth--; 4942 break; 4943 } else { 4944 if (flag != 0) { 4945 newarr[i * nth_per_core + k]++; 4946 cnt--; 4947 nth--; 4948 break; 4949 } 4950 } 4951 } 4952 } 4953 if (cnt == 0 || nth == 0) { 4954 break; 4955 } 
4956 } 4957 if (nth == 0) { 4958 break; 4959 } 4960 } 4961 flag = 1; 4962 } 4963 int sum = 0; 4964 for (int i = 0; i < nproc; i++) { 4965 sum += newarr[i]; 4966 if (sum > tid) { 4967 if (fine_gran) { 4968 int osID = procarr[i]; 4969 KMP_CPU_SET(osID, mask); 4970 } else { 4971 int coreID = i / nth_per_core; 4972 for (int ii = 0; ii < nth_per_core; ii++) { 4973 int osID = procarr[coreID * nth_per_core + ii]; 4974 if (osID != -1) { 4975 KMP_CPU_SET(osID, mask); 4976 } 4977 } 4978 } 4979 break; 4980 } 4981 } 4982 __kmp_free(newarr); 4983 } 4984 4985 if (__kmp_affinity_verbose) { 4986 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4987 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4988 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4989 __kmp_gettid(), tid, buf); 4990 } 4991 __kmp_set_system_affinity(mask, TRUE); 4992 KMP_CPU_FREE_FROM_STACK(mask); 4993 } 4994 } 4995 4996 #if KMP_OS_LINUX 4997 // We don't need this entry for Windows because 4998 // there is GetProcessAffinityMask() api 4999 // 5000 // The intended usage is indicated by these steps: 5001 // 1) The user gets the current affinity mask 5002 // 2) Then sets the affinity by calling this function 5003 // 3) Error check the return value 5004 // 4) Use non-OpenMP parallelization 5005 // 5) Reset the affinity to what was stored in step 1) 5006 #ifdef __cplusplus 5007 extern "C" 5008 #endif 5009 int 5010 kmp_set_thread_affinity_mask_initial() 5011 // the function returns 0 on success, 5012 // -1 if we cannot bind thread 5013 // >0 (errno) if an error happened during binding 5014 { 5015 int gtid = __kmp_get_gtid(); 5016 if (gtid < 0) { 5017 // Do not touch non-omp threads 5018 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5019 "non-omp thread, returning\n")); 5020 return -1; 5021 } 5022 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5023 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5024 "affinity not initialized, returning\n")); 5025 return -1; 5026 } 5027 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5028 "set full mask for thread %d\n", 5029 gtid)); 5030 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5031 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5032 } 5033 #endif 5034 5035 #endif // KMP_AFFINITY_SUPPORTED 5036
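// Illustrative sketch (kept out of the build): the intended usage of
// kmp_set_thread_affinity_mask_initial() as described in the numbered steps
// above, assuming a Linux caller using the GNU pthread affinity calls.
#if 0
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

extern "C" int kmp_set_thread_affinity_mask_initial();

static void example_non_openmp_parallel_phase(void (*non_omp_work)(void)) {
  cpu_set_t saved;
  // 1) Save the current affinity mask of this thread.
  pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
  // 2) Widen the mask to the runtime's full initial mask, and
  // 3) check the return value (0 success, -1 cannot bind, >0 errno).
  if (kmp_set_thread_affinity_mask_initial() == 0) {
    // 4) Run the non-OpenMP parallelization.
    non_omp_work();
  }
  // 5) Restore the mask saved in step 1.
  pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);
}
#endif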