/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#endif

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need
  // to init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when
// there is one thread context per core; the extra thread-context level offers
// no unique labels, so it is removed.
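// Illustrative example (hypothetical machine, not taken from this file): with
// labels ordered (socket, core, thread) and exactly one thread context per
// core, the thread level is radix 1; dropping it leaves (socket, core) with
// the same underlying structure but one fewer level.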
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, kmp_hw_t *types) {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_DIE] = 85;
  preference[KMP_HW_NUMA] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    KMP_DEBUG_ASSERT(top_index1 >= 0 && top_index1 < depth);
    KMP_DEBUG_ASSERT(top_index2 >= 0 && top_index2 < depth);
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    if (type1 == KMP_HW_SOCKET && type2 == KMP_HW_CORE) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    unsigned id1 = addrP[0].first.labels[top_index1];
    unsigned id2 = addrP[0].first.labels[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < nTh; ++hwidx) {
      if (addrP[hwidx].first.labels[top_index1] == id1 &&
          addrP[hwidx].first.labels[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (addrP[hwidx].first.labels[top_index2] != id2)
        all_same = false;
      id1 = addrP[hwidx].first.labels[top_index1];
      id2 = addrP[hwidx].first.labels[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      for (int idx = 0; idx < nTh; ++idx) {
        Address &hw_thread = addrP[idx].first;
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.labels[d] = hw_thread.labels[d + 1];
        hw_thread.depth--;
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
  return depth;
}

// Gather the count of each topology layer and the ratio
// ratio contains the number of types[i] / types[i+1] and so forth
// count contains the absolute number of types[i]
static void __kmp_affinity_gather_enumeration_information(AddrUnsPair *addrP,
                                                          int nTh, int depth,
                                                          kmp_hw_t *types,
                                                          int *ratio,
                                                          int *count) {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  for (int i = 0; i < nTh; ++i) {
    Address &hw_thread = addrP[i].first;
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.labels[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.labels[layer];
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
static bool __kmp_affinity_discover_uniformity(int depth, int *ratio,
                                               int *count) {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  return (num == count[depth - 1]);
}

// calculate the number of X's per Y
static inline int __kmp_affinity_calculate_ratio(int *ratio, int deep_level,
                                                 int shallow_level) {
  int retval = 1;
  if (deep_level < 0 || shallow_level < 0)
    return retval;
  for (int level = deep_level; level > shallow_level; --level)
    retval *= ratio[level];
  return retval;
}

static void __kmp_affinity_print_topology(AddrUnsPair *addrP, int len,
                                          int depth, kmp_hw_t *types) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    for (int i = 0; i < depth; ++i) {
      __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]),
                          addrP[proc].first.labels[i]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
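// For illustration only (the exact wording comes from the i18n catalog): each
// line of the map printed below ties one OS proc to its labels, along the
// lines of "OS proc 5 maps to socket 0 core 2 thread 1".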
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;
static int nDiesPerPkg = 1;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
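// Illustrative check (hypothetical numbers): with nPackages = 2, nDiesPerPkg =
// 1, nCoresPerPkg = 4 and __kmp_nThreadsPerCore = 2, the product below is 16;
// the topology is reported as uniform only if __kmp_avail_proc is also 16
// (e.g., no cores are offline or excluded from the process affinity mask).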
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc ==
         (__kmp_nThreadsPerCore * nCoresPerPkg * nDiesPerPkg * nPackages);
}

#if KMP_USE_HWLOC

static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o,
                                               kmp_hwloc_depth_t depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype,
                              lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // store sub_id + 1 so that 0 is distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id, nActiveThreads;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  int ratio[KMP_HW_LAST];
  int count[KMP_HW_LAST];
  kmp_hw_t types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of the PU object, so check PACKAGE and
    // CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotUsingHwloc, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0 && depth <= KMP_HW_LAST);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
  }

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  hw_thread_index = 0;
  pu = NULL;
  nActiveThreads = 0;
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    Address hw_thread(depth);
    if (included) {
      hw_thread.labels[index] = pu->logical_index;
      __kmp_pu_os_idx[hw_thread_index] = pu->os_index;
      index--;
      nActiveThreads++;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.labels[index] = memory->logical_index;
            hw_thread.labels[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.labels[index] = obj->logical_index;
          hw_thread.labels[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included) {
      retval[hw_thread_index] = AddrUnsPair(hw_thread, pu->os_index);
      hw_thread_index++;
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 types);

  __kmp_affinity_gather_enumeration_information(retval, nActiveThreads, depth,
                                                types, ratio, count);

  for (int level = 0; level < depth; ++level) {
    if ((types[level] == KMP_HW_L2 || types[level] == KMP_HW_L3))
      __kmp_tile_depth = level;
  }

  // This routine should set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  int thread_level, core_level, tile_level, numa_level, socket_level;
  thread_level = core_level = tile_level = numa_level = socket_level = -1;
  for (int level = 0; level < depth; ++level) {
    if (types[level] == KMP_HW_THREAD)
      thread_level = level;
    else if (types[level] == KMP_HW_CORE)
      core_level = level;
    else if (types[level] == KMP_HW_SOCKET)
      socket_level = level;
    else if (types[level] == KMP_HW_TILE)
      tile_level = level;
    else if (types[level] == KMP_HW_NUMA)
      numa_level = level;
  }
  __kmp_nThreadsPerCore =
      __kmp_affinity_calculate_ratio(ratio, thread_level, core_level);
  nCoresPerPkg =
      __kmp_affinity_calculate_ratio(ratio, core_level, socket_level);
  if (socket_level >= 0)
    nPackages = count[socket_level];
  else
    nPackages = 1;
  if (core_level >= 0)
    __kmp_ncores = count[core_level];
  else
    __kmp_ncores = 1;

  unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    kmp_hw_t numerator_type, denominator_type;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    __kmp_str_buf_clear(&buf);

    if (core_level < 0)
      core_level = depth - 1;
    int ncores = count[core_level];

    denominator_type = KMP_HW_UNKNOWN;
    for (int level = 0; level < depth; ++level) {
      int c;
      bool plural;
      numerator_type = types[level];
      c = ratio[level];
      plural = (c > 1);
      if (level == 0) {
        __kmp_str_buf_print(
            &buf, "%d %s", c,
            __kmp_hw_get_catalog_string(numerator_type, plural));
      } else {
        __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                            __kmp_hw_get_catalog_string(numerator_type, plural),
                            __kmp_hw_get_catalog_string(denominator_type));
      }
      denominator_type = numerator_type;
    }
    KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores);
    __kmp_str_buf_free(&buf);
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Set the granularity level based on what levels are modeled
  // in the machine topology map.
  if (__kmp_affinity_gran == affinity_gran_node)
    __kmp_affinity_gran = affinity_gran_numa;
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread))
      __kmp_affinity_gran_levels++;
    if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core))
      __kmp_affinity_gran_levels++;
    if ((tile_level >= 0) && (__kmp_affinity_gran > affinity_gran_tile))
      __kmp_affinity_gran_levels++;
    if ((numa_level >= 0) && (__kmp_affinity_gran > affinity_gran_numa))
      __kmp_affinity_gran_levels++;
    if ((socket_level >= 0) && (__kmp_affinity_gran > affinity_gran_package))
      __kmp_affinity_gran_levels++;
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_topology(retval, nActiveThreads, depth, types);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

/*
 * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
    Bits            Bits           Bits          Bits
    31-16           15-8           7-4           4-0
---+-----------+--------------+-------------+-----------------+
EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
---+-----------|--------------+-------------+-----------------|
EBX| reserved  |   Num logical processors at level (16 bits)  |
---+-----------|--------------+-------------------------------|
ECX| reserved  |   Level Type |  Level Number (8 bits)        |
---+-----------+--------------+-------------------------------|
EDX|                    X2APIC ID (32 bits)                   |
---+----------------------------------------------------------+
*/

enum {
  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
  INTEL_LEVEL_TYPE_SMT = 1,
  INTEL_LEVEL_TYPE_CORE = 2,
  INTEL_LEVEL_TYPE_TILE = 3,
  INTEL_LEVEL_TYPE_MODULE = 4,
  INTEL_LEVEL_TYPE_DIE = 5,
  INTEL_LEVEL_TYPE_LAST = 6,
};

struct cpuid_level_info_t {
  unsigned level_type, mask, mask_width, nitems, cache_mask;
};

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
  switch (intel_type) {
  case INTEL_LEVEL_TYPE_INVALID:
    return KMP_HW_SOCKET;
  case INTEL_LEVEL_TYPE_SMT:
    return KMP_HW_THREAD;
  case INTEL_LEVEL_TYPE_CORE:
    return KMP_HW_CORE;
  // TODO: add support for the tile and module
  case INTEL_LEVEL_TYPE_TILE:
    return KMP_HW_UNKNOWN;
  case INTEL_LEVEL_TYPE_MODULE:
    return KMP_HW_UNKNOWN;
  case INTEL_LEVEL_TYPE_DIE:
    return KMP_HW_DIE;
  }
  return KMP_HW_UNKNOWN;
}

// This function takes the topology leaf, a levels array to store the levels
// detected and a bitmap of the known levels.
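// As an illustration of the masks computed below (hypothetical values, not
// from any particular CPU): if the leaf reports SMT with a cumulative mask
// width of 1 and CORE with a cumulative mask width of 4, the per-level masks
// become SMT = 0x1 and CORE = 0xE, and the package id is the remaining upper
// bits of the APIC id (apic_id >> 4).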
// Returns the number of levels in the topology
static unsigned
__kmp_x2apicid_get_levels(int leaf,
                          cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
                          kmp_uint64 known_levels) {
  unsigned level, levels_index;
  unsigned level_type, mask_width, nitems;
  kmp_cpuid buf;

  // The new algorithm has known topology layers act as highest unknown
  // topology layers when unknown topology layers exist.
  // e.g., Suppose layers were SMT CORE <Y> <Z> PACKAGE
  // Then CORE will take the characteristics (nitems and mask width) of <Z>.
  // In developing the id mask for each layer, this eliminates unknown portions
  // of the topology while still keeping the correct underlying structure.
  level = levels_index = 0;
  do {
    __kmp_x86_cpuid(leaf, level, &buf);
    level_type = __kmp_extract_bits<8, 15>(buf.ecx);
    mask_width = __kmp_extract_bits<0, 4>(buf.eax);
    nitems = __kmp_extract_bits<0, 15>(buf.ebx);
    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
      return 0;

    if (known_levels & (1ull << level_type)) {
      // Add a new level to the topology
      KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
      levels[levels_index].level_type = level_type;
      levels[levels_index].mask_width = mask_width;
      levels[levels_index].nitems = nitems;
      levels_index++;
    } else {
      // If it is an unknown level, then logically move the previous layer up
      if (levels_index > 0) {
        levels[levels_index - 1].mask_width = mask_width;
        levels[levels_index - 1].nitems = nitems;
      }
    }
    level++;
  } while (level_type != INTEL_LEVEL_TYPE_INVALID);

  // Set the masks to & with apicid
  for (unsigned i = 0; i < levels_index; ++i) {
    if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
      levels[i].mask = ~((-1) << levels[i].mask_width);
      levels[i].cache_mask = (-1) << levels[i].mask_width;
      for (unsigned j = 0; j < i; ++j)
        levels[i].mask ^= levels[j].mask;
    } else {
      KMP_DEBUG_ASSERT(levels_index > 0);
      levels[i].mask = (-1) << levels[i - 1].mask_width;
      levels[i].cache_mask = 0;
    }
  }
  return levels_index;
}

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
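// Illustrative decomposition (hypothetical widths, not from any particular
// CPU): with maxThreadsPerPkg = 8 and maxCoresPerPkg = 4, widthCT = 3,
// widthC = 2 and widthT = 1, so an Apic Id of 0b10110 yields pkgId = 2,
// coreId = 3 and threadId = 0.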
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will
    // be 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this only as an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained
    // locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned
  // among the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consistency checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars,
      // though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
1681 __kmp_affinity_gran_levels = 0;
1682 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1683 __kmp_affinity_gran_levels++;
1684 }
1685 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1686 __kmp_affinity_gran_levels++;
1687 }
1688 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1689 __kmp_affinity_gran_levels++;
1690 }
1691 }
1692
1693 if (__kmp_affinity_verbose) {
1694 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1695 coreLevel, threadLevel);
1696 }
1697
1698 __kmp_free(threadInfo);
1699 KMP_CPU_FREE(oldMask);
1700 return depth;
1701 }
1702
1703 // Intel(R) microarchitecture code name Nehalem, Dunnington and later
1704 // architectures support a newer interface for specifying the x2APIC Ids,
1705 // based on CPUID.B or CPUID.1F
1706 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1707 kmp_i18n_id_t *const msg_id) {
1708
1709 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
1710 int ratio[KMP_HW_LAST];
1711 int count[KMP_HW_LAST];
1712 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
1713 unsigned levels_index;
1714 kmp_cpuid buf;
1715 kmp_uint64 known_levels;
1716 int topology_leaf, highest_leaf, apic_id;
1717 int num_leaves;
1718 static int leaves[] = {0, 0};
1719
1720 kmp_i18n_id_t leaf_message_id;
1721
1722 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
1723
1724 *msg_id = kmp_i18n_null;
1725
1726 // Figure out the known topology levels
1727 known_levels = 0ull;
1728 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
1729 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
1730 known_levels |= (1ull << i);
1731 }
1732 }
1733
1734 // Get the highest cpuid leaf supported
1735 __kmp_x86_cpuid(0, 0, &buf);
1736 highest_leaf = buf.eax;
1737
1738 // If a specific topology method was requested, only allow that specific leaf;
1739 // otherwise, try both leaves 31 and 11, in that order.
1740 num_leaves = 0;
1741 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
1742 num_leaves = 1;
1743 leaves[0] = 11;
1744 leaf_message_id = kmp_i18n_str_NoLeaf11Support;
1745 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
1746 num_leaves = 1;
1747 leaves[0] = 31;
1748 leaf_message_id = kmp_i18n_str_NoLeaf31Support;
1749 } else {
1750 num_leaves = 2;
1751 leaves[0] = 31;
1752 leaves[1] = 11;
1753 leaf_message_id = kmp_i18n_str_NoLeaf11Support;
1754 }
1755
1756 // Check to see if cpuid leaf 31 or 11 is supported.
1757 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1758 topology_leaf = -1;
1759 for (int i = 0; i < num_leaves; ++i) {
1760 int leaf = leaves[i];
1761 if (highest_leaf < leaf)
1762 continue;
1763 __kmp_x86_cpuid(leaf, 0, &buf);
1764 if (buf.ebx == 0)
1765 continue;
1766 topology_leaf = leaf;
1767 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
1768 if (levels_index == 0)
1769 continue;
1770 break;
1771 }
1772 if (topology_leaf == -1 || levels_index == 0) {
1773 *msg_id = leaf_message_id;
1774 return -1;
1775 }
1776 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
1777
1778 // The algorithm used starts by setting the affinity to each available thread
1779 // and retrieving info from the cpuid instruction, so if we are not capable of
1780 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
1781 // we need to do something else - use the defaults that we calculated from
1782 // issuing cpuid without binding to each proc.
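// Illustrative example of that fallback estimate (numbers assumed, not from
// a real system): if cpuid reports __kmp_nThreadsPerCore == 1 and
// nCoresPerPkg == 8, and __kmp_xproc == 32, then the code below computes
// __kmp_ncores = 32 / 1 = 32 and nPackages = (32 + 8 - 1) / 8 = 4.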
1783 if (!KMP_AFFINITY_CAPABLE()) { 1784 // Hack to try and infer the machine topology using only the data 1785 // available from cpuid on the current thread, and __kmp_xproc. 1786 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1787 1788 for (unsigned i = 0; i < levels_index; ++i) { 1789 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 1790 __kmp_nThreadsPerCore = levels[i].nitems; 1791 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 1792 nCoresPerPkg = levels[i].nitems; 1793 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_DIE) { 1794 nDiesPerPkg = levels[i].nitems; 1795 } 1796 } 1797 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1798 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1799 if (__kmp_affinity_verbose) { 1800 KMP_INFORM(AffNotCapableUseLocCpuidL, "KMP_AFFINITY", topology_leaf); 1801 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1802 if (__kmp_affinity_uniform_topology()) { 1803 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1804 } else { 1805 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1806 } 1807 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1808 __kmp_nThreadsPerCore, __kmp_ncores); 1809 } 1810 return 0; 1811 } 1812 1813 // From here on, we can assume that it is safe to call 1814 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 1815 // __kmp_affinity_type = affinity_none. 1816 1817 // Save the affinity mask for the current thread. 1818 kmp_affin_mask_t *oldMask; 1819 KMP_CPU_ALLOC(oldMask); 1820 __kmp_get_system_affinity(oldMask, TRUE); 1821 1822 // Allocate the data structure to be returned. 1823 int depth = levels_index; 1824 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 1825 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 1826 AddrUnsPair *retval = 1827 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1828 1829 // Run through each of the available contexts, binding the current thread 1830 // to it, and obtaining the pertinent information using the cpuid instr. 1831 unsigned int proc; 1832 int nApics = 0; 1833 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1834 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 1835 unsigned my_levels_index; 1836 1837 // Skip this proc if it is not included in the machine model. 1838 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1839 continue; 1840 } 1841 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1842 1843 __kmp_affinity_dispatch->bind_thread(proc); 1844 1845 // New algorithm 1846 __kmp_x86_cpuid(topology_leaf, 0, &buf); 1847 apic_id = buf.edx; 1848 Address addr(depth); 1849 my_levels_index = 1850 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 1851 if (my_levels_index == 0 || my_levels_index != levels_index) { 1852 KMP_CPU_FREE(oldMask); 1853 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1854 return -1; 1855 } 1856 // Put in topology information 1857 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 1858 addr.labels[idx] = apic_id & my_levels[j].mask; 1859 if (j > 0) 1860 addr.labels[idx] >>= my_levels[j - 1].mask_width; 1861 } 1862 retval[nApics++] = AddrUnsPair(addr, proc); 1863 } 1864 1865 // We've collected all the info we need. 1866 // Restore the old affinity mask for this thread. 1867 __kmp_set_system_affinity(oldMask, TRUE); 1868 1869 // If there's only one thread context to bind to, return now. 
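// A worked example of the label decoding above (mask and width values are
// assumed purely for illustration): suppose the SMT level reports mask 0x1
// with mask_width 1 and the core level reports mask 0x3F. For apic_id 0x2D
// (0b101101) the loop produces
//   thread label  = 0x2D & 0x1         = 1
//   core   label  = (0x2D & 0x3F) >> 1 = 22
//   socket label  = the remaining high bits shifted down, i.e. 0 here
// with labels stored outermost-first, so addr.labels[0] is the socket.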
1870 KMP_ASSERT(nApics > 0); 1871 if (nApics == 1) { 1872 int pkg_level; 1873 __kmp_ncores = nPackages = 1; 1874 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1875 if (__kmp_affinity_verbose) { 1876 KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); 1877 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1878 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1879 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1880 __kmp_nThreadsPerCore, __kmp_ncores); 1881 } 1882 1883 if (__kmp_affinity_type == affinity_none) { 1884 __kmp_free(retval); 1885 KMP_CPU_FREE(oldMask); 1886 return 0; 1887 } 1888 1889 pkg_level = 0; 1890 for (int i = 0; i < depth; ++i) 1891 if (types[i] == KMP_HW_SOCKET) { 1892 pkg_level = i; 1893 break; 1894 } 1895 // Form an Address object which only includes the package level. 1896 Address addr(1); 1897 addr.labels[0] = retval[0].first.labels[pkg_level]; 1898 retval[0].first = addr; 1899 1900 if (__kmp_affinity_gran_levels < 0) { 1901 __kmp_affinity_gran_levels = 0; 1902 } 1903 1904 if (__kmp_affinity_verbose) { 1905 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1906 } 1907 1908 *address2os = retval; 1909 KMP_CPU_FREE(oldMask); 1910 return 1; 1911 } 1912 1913 // Sort the table by physical Id. 1914 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1915 1916 __kmp_affinity_gather_enumeration_information(retval, nApics, depth, types, 1917 ratio, count); 1918 1919 // When affinity is off, this routine will still be called to set 1920 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1921 // Make sure all these vars are set correctly, and return if affinity is not 1922 // enabled. 1923 int thread_level, core_level, socket_level, die_level; 1924 thread_level = core_level = die_level = socket_level = -1; 1925 for (int level = 0; level < depth; ++level) { 1926 if (types[level] == KMP_HW_THREAD) 1927 thread_level = level; 1928 else if (types[level] == KMP_HW_CORE) 1929 core_level = level; 1930 else if (types[level] == KMP_HW_DIE) 1931 die_level = level; 1932 else if (types[level] == KMP_HW_SOCKET) 1933 socket_level = level; 1934 } 1935 __kmp_nThreadsPerCore = 1936 __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); 1937 if (die_level > 0) { 1938 nDiesPerPkg = 1939 __kmp_affinity_calculate_ratio(ratio, die_level, socket_level); 1940 nCoresPerPkg = __kmp_affinity_calculate_ratio(ratio, core_level, die_level); 1941 } else { 1942 nCoresPerPkg = 1943 __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); 1944 } 1945 if (socket_level >= 0) 1946 nPackages = count[socket_level]; 1947 else 1948 nPackages = 1; 1949 if (core_level >= 0) 1950 __kmp_ncores = count[core_level]; 1951 else 1952 __kmp_ncores = 1; 1953 1954 // Check to see if the machine topology is uniform 1955 unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); 1956 1957 // Print the machine topology summary. 
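// For instance, on a hypothetical 2-socket machine with 4 cores per socket
// and 2 threads per core, the loop below builds the string
//   "2 Sockets x 4 Cores/Socket x 2 Threads/Core"
// and reports 8 as the total core count via TopologyGeneric.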
1958 if (__kmp_affinity_verbose) { 1959 kmp_hw_t numerator_type, denominator_type; 1960 KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); 1961 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1962 if (uniform) { 1963 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1964 } else { 1965 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1966 } 1967 1968 kmp_str_buf_t buf; 1969 __kmp_str_buf_init(&buf); 1970 1971 if (core_level < 0) 1972 core_level = depth - 1; 1973 int ncores = count[core_level]; 1974 1975 denominator_type = KMP_HW_UNKNOWN; 1976 for (int level = 0; level < depth; ++level) { 1977 int c; 1978 bool plural; 1979 numerator_type = types[level]; 1980 c = ratio[level]; 1981 plural = (c > 1); 1982 if (level == 0) { 1983 __kmp_str_buf_print( 1984 &buf, "%d %s", c, 1985 __kmp_hw_get_catalog_string(numerator_type, plural)); 1986 } else { 1987 __kmp_str_buf_print(&buf, " x %d %s/%s", c, 1988 __kmp_hw_get_catalog_string(numerator_type, plural), 1989 __kmp_hw_get_catalog_string(denominator_type)); 1990 } 1991 denominator_type = numerator_type; 1992 } 1993 KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); 1994 __kmp_str_buf_free(&buf); 1995 } 1996 1997 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1998 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1999 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2000 for (proc = 0; (int)proc < nApics; ++proc) { 2001 __kmp_pu_os_idx[proc] = retval[proc].second; 2002 } 2003 if (__kmp_affinity_type == affinity_none) { 2004 __kmp_free(retval); 2005 KMP_CPU_FREE(oldMask); 2006 return 0; 2007 } 2008 2009 // Find any levels with radix 1, and remove them from the map 2010 // (except for the package level). 2011 depth = __kmp_affinity_remove_radix_one_levels(retval, nApics, depth, types); 2012 thread_level = core_level = die_level = socket_level = -1; 2013 for (int level = 0; level < depth; ++level) { 2014 if (types[level] == KMP_HW_THREAD) 2015 thread_level = level; 2016 else if (types[level] == KMP_HW_CORE) 2017 core_level = level; 2018 else if (types[level] == KMP_HW_DIE) 2019 die_level = level; 2020 else if (types[level] == KMP_HW_SOCKET) 2021 socket_level = level; 2022 } 2023 2024 if (__kmp_affinity_gran_levels < 0) { 2025 // Set the granularity level based on what levels are modeled 2026 // in the machine topology map. 
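// As an illustration (assumed topology): if the map kept socket, die, core
// and thread levels and the user asked for granularity=die, the thread and
// core tests below both fire, giving __kmp_affinity_gran_levels == 2, so
// threads sharing a die later end up with identical affinity masks.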
2027 __kmp_affinity_gran_levels = 0; 2028 if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 2029 __kmp_affinity_gran_levels++; 2030 } 2031 if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 2032 __kmp_affinity_gran_levels++; 2033 } 2034 if ((die_level >= 0) && (__kmp_affinity_gran > affinity_gran_die)) { 2035 __kmp_affinity_gran_levels++; 2036 } 2037 if (__kmp_affinity_gran > affinity_gran_package) { 2038 __kmp_affinity_gran_levels++; 2039 } 2040 } 2041 2042 if (__kmp_affinity_verbose) { 2043 __kmp_affinity_print_topology(retval, nApics, depth, types); 2044 } 2045 2046 KMP_CPU_FREE(oldMask); 2047 *address2os = retval; 2048 return depth; 2049 } 2050 2051 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2052 2053 #define osIdIndex 0 2054 #define threadIdIndex 1 2055 #define coreIdIndex 2 2056 #define pkgIdIndex 3 2057 #define nodeIdIndex 4 2058 2059 typedef unsigned *ProcCpuInfo; 2060 static unsigned maxIndex = pkgIdIndex; 2061 2062 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2063 const void *b) { 2064 unsigned i; 2065 const unsigned *aa = *(unsigned *const *)a; 2066 const unsigned *bb = *(unsigned *const *)b; 2067 for (i = maxIndex;; i--) { 2068 if (aa[i] < bb[i]) 2069 return -1; 2070 if (aa[i] > bb[i]) 2071 return 1; 2072 if (i == osIdIndex) 2073 break; 2074 } 2075 return 0; 2076 } 2077 2078 #if KMP_USE_HIER_SCHED 2079 // Set the array sizes for the hierarchy layers 2080 static void __kmp_dispatch_set_hierarchy_values() { 2081 // Set the maximum number of L1's to number of cores 2082 // Set the maximum number of L2's to to either number of cores / 2 for 2083 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2084 // Or the number of cores for Intel(R) Xeon(R) processors 2085 // Set the maximum number of NUMA nodes and L3's to number of packages 2086 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2087 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2088 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2089 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2090 KMP_MIC_SUPPORTED 2091 if (__kmp_mic_type >= mic3) 2092 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2093 else 2094 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2095 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2096 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2097 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2098 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2099 // Set the number of threads per unit 2100 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2101 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2102 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2103 __kmp_nThreadsPerCore; 2104 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2105 KMP_MIC_SUPPORTED 2106 if (__kmp_mic_type >= mic3) 2107 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2108 2 * __kmp_nThreadsPerCore; 2109 else 2110 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2111 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2112 __kmp_nThreadsPerCore; 2113 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2114 nCoresPerPkg * __kmp_nThreadsPerCore; 2115 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2116 nCoresPerPkg * __kmp_nThreadsPerCore; 2117 
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2118 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2119 } 2120 2121 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2122 // i.e., this thread's L1 or this thread's L2, etc. 2123 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2124 int index = type + 1; 2125 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2126 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2127 if (type == kmp_hier_layer_e::LAYER_THREAD) 2128 return tid; 2129 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2130 return 0; 2131 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2132 if (tid >= num_hw_threads) 2133 tid = tid % num_hw_threads; 2134 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2135 } 2136 2137 // Return the number of t1's per t2 2138 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2139 int i1 = t1 + 1; 2140 int i2 = t2 + 1; 2141 KMP_DEBUG_ASSERT(i1 <= i2); 2142 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2143 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2144 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2145 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2146 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2147 } 2148 #endif // KMP_USE_HIER_SCHED 2149 2150 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2151 // affinity map. 2152 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 2153 int *line, 2154 kmp_i18n_id_t *const msg_id, 2155 FILE *f) { 2156 *address2os = NULL; 2157 *msg_id = kmp_i18n_null; 2158 2159 // Scan of the file, and count the number of "processor" (osId) fields, 2160 // and find the highest value of <n> for a node_<n> field. 2161 char buf[256]; 2162 unsigned num_records = 0; 2163 while (!feof(f)) { 2164 buf[sizeof(buf) - 1] = 1; 2165 if (!fgets(buf, sizeof(buf), f)) { 2166 // Read errors presumably because of EOF 2167 break; 2168 } 2169 2170 char s1[] = "processor"; 2171 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2172 num_records++; 2173 continue; 2174 } 2175 2176 // FIXME - this will match "node_<n> <garbage>" 2177 unsigned level; 2178 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2179 if (nodeIdIndex + level >= maxIndex) { 2180 maxIndex = nodeIdIndex + level; 2181 } 2182 continue; 2183 } 2184 } 2185 2186 // Check for empty file / no valid processor records, or too many. The number 2187 // of records can't exceed the number of valid bits in the affinity mask. 2188 if (num_records == 0) { 2189 *line = 0; 2190 *msg_id = kmp_i18n_str_NoProcRecords; 2191 return -1; 2192 } 2193 if (num_records > (unsigned)__kmp_xproc) { 2194 *line = 0; 2195 *msg_id = kmp_i18n_str_TooManyProcRecords; 2196 return -1; 2197 } 2198 2199 // Set the file pointer back to the beginning, so that we can scan the file 2200 // again, this time performing a full parse of the data. Allocate a vector of 2201 // ProcCpuInfo object, where we will place the data. Adding an extra element 2202 // at the end allows us to remove a lot of extra checks for termination 2203 // conditions. 2204 if (fseek(f, 0, SEEK_SET) != 0) { 2205 *line = 0; 2206 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2207 return -1; 2208 } 2209 2210 // Allocate the array of records to store the proc info in. The dummy 2211 // element at the end makes the logic in filling them out easier to code. 
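// For reference, the parser below only looks at a handful of cpuinfo fields.
// A typical record (illustrative; real files contain many more lines) is
//   processor   : 3
//   physical id : 0
//   core id     : 1
// with a blank line terminating each processor record.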
2212 unsigned **threadInfo = 2213 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2214 unsigned i; 2215 for (i = 0; i <= num_records; i++) { 2216 threadInfo[i] = 2217 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2218 } 2219 2220 #define CLEANUP_THREAD_INFO \ 2221 for (i = 0; i <= num_records; i++) { \ 2222 __kmp_free(threadInfo[i]); \ 2223 } \ 2224 __kmp_free(threadInfo); 2225 2226 // A value of UINT_MAX means that we didn't find the field 2227 unsigned __index; 2228 2229 #define INIT_PROC_INFO(p) \ 2230 for (__index = 0; __index <= maxIndex; __index++) { \ 2231 (p)[__index] = UINT_MAX; \ 2232 } 2233 2234 for (i = 0; i <= num_records; i++) { 2235 INIT_PROC_INFO(threadInfo[i]); 2236 } 2237 2238 unsigned num_avail = 0; 2239 *line = 0; 2240 while (!feof(f)) { 2241 // Create an inner scoping level, so that all the goto targets at the end of 2242 // the loop appear in an outer scoping level. This avoids warnings about 2243 // jumping past an initialization to a target in the same block. 2244 { 2245 buf[sizeof(buf) - 1] = 1; 2246 bool long_line = false; 2247 if (!fgets(buf, sizeof(buf), f)) { 2248 // Read errors presumably because of EOF 2249 // If there is valid data in threadInfo[num_avail], then fake 2250 // a blank line in ensure that the last address gets parsed. 2251 bool valid = false; 2252 for (i = 0; i <= maxIndex; i++) { 2253 if (threadInfo[num_avail][i] != UINT_MAX) { 2254 valid = true; 2255 } 2256 } 2257 if (!valid) { 2258 break; 2259 } 2260 buf[0] = 0; 2261 } else if (!buf[sizeof(buf) - 1]) { 2262 // The line is longer than the buffer. Set a flag and don't 2263 // emit an error if we were going to ignore the line, anyway. 2264 long_line = true; 2265 2266 #define CHECK_LINE \ 2267 if (long_line) { \ 2268 CLEANUP_THREAD_INFO; \ 2269 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2270 return -1; \ 2271 } 2272 } 2273 (*line)++; 2274 2275 char s1[] = "processor"; 2276 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2277 CHECK_LINE; 2278 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2279 unsigned val; 2280 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2281 goto no_val; 2282 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2283 #if KMP_ARCH_AARCH64 2284 // Handle the old AArch64 /proc/cpuinfo layout differently, 2285 // it contains all of the 'processor' entries listed in a 2286 // single 'Processor' section, therefore the normal looking 2287 // for duplicates in that section will always fail. 
2288 num_avail++; 2289 #else 2290 goto dup_field; 2291 #endif 2292 threadInfo[num_avail][osIdIndex] = val; 2293 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2294 char path[256]; 2295 KMP_SNPRINTF( 2296 path, sizeof(path), 2297 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2298 threadInfo[num_avail][osIdIndex]); 2299 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2300 2301 KMP_SNPRINTF(path, sizeof(path), 2302 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2303 threadInfo[num_avail][osIdIndex]); 2304 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2305 continue; 2306 #else 2307 } 2308 char s2[] = "physical id"; 2309 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2310 CHECK_LINE; 2311 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2312 unsigned val; 2313 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2314 goto no_val; 2315 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2316 goto dup_field; 2317 threadInfo[num_avail][pkgIdIndex] = val; 2318 continue; 2319 } 2320 char s3[] = "core id"; 2321 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2322 CHECK_LINE; 2323 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2324 unsigned val; 2325 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2326 goto no_val; 2327 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2328 goto dup_field; 2329 threadInfo[num_avail][coreIdIndex] = val; 2330 continue; 2331 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2332 } 2333 char s4[] = "thread id"; 2334 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2335 CHECK_LINE; 2336 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2337 unsigned val; 2338 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2339 goto no_val; 2340 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2341 goto dup_field; 2342 threadInfo[num_avail][threadIdIndex] = val; 2343 continue; 2344 } 2345 unsigned level; 2346 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2347 CHECK_LINE; 2348 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2349 unsigned val; 2350 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2351 goto no_val; 2352 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2353 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2354 goto dup_field; 2355 threadInfo[num_avail][nodeIdIndex + level] = val; 2356 continue; 2357 } 2358 2359 // We didn't recognize the leading token on the line. There are lots of 2360 // leading tokens that we don't recognize - if the line isn't empty, go on 2361 // to the next line. 2362 if ((*buf != 0) && (*buf != '\n')) { 2363 // If the line is longer than the buffer, read characters 2364 // until we find a newline. 2365 if (long_line) { 2366 int ch; 2367 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2368 ; 2369 } 2370 continue; 2371 } 2372 2373 // A newline has signalled the end of the processor record. 2374 // Check that there aren't too many procs specified. 2375 if ((int)num_avail == __kmp_xproc) { 2376 CLEANUP_THREAD_INFO; 2377 *msg_id = kmp_i18n_str_TooManyEntries; 2378 return -1; 2379 } 2380 2381 // Check for missing fields. The osId field must be there, and we 2382 // currently require that the physical id field is specified, also. 
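// Recall the column layout used for each record (see the *Index constants
// above): threadInfo[n][0] holds the OS proc id ("processor"), [1] the
// thread id, [2] the core id, [3] the physical package id, and [4] and up
// any node_<n> ids; UINT_MAX marks a field that never appeared.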
2383 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2384 CLEANUP_THREAD_INFO; 2385 *msg_id = kmp_i18n_str_MissingProcField; 2386 return -1; 2387 } 2388 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2389 CLEANUP_THREAD_INFO; 2390 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2391 return -1; 2392 } 2393 2394 // Skip this proc if it is not included in the machine model. 2395 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2396 __kmp_affin_fullMask)) { 2397 INIT_PROC_INFO(threadInfo[num_avail]); 2398 continue; 2399 } 2400 2401 // We have a successful parse of this proc's info. 2402 // Increment the counter, and prepare for the next proc. 2403 num_avail++; 2404 KMP_ASSERT(num_avail <= num_records); 2405 INIT_PROC_INFO(threadInfo[num_avail]); 2406 } 2407 continue; 2408 2409 no_val: 2410 CLEANUP_THREAD_INFO; 2411 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2412 return -1; 2413 2414 dup_field: 2415 CLEANUP_THREAD_INFO; 2416 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2417 return -1; 2418 } 2419 *line = 0; 2420 2421 #if KMP_MIC && REDUCE_TEAM_SIZE 2422 unsigned teamSize = 0; 2423 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2424 2425 // check for num_records == __kmp_xproc ??? 2426 2427 // If there's only one thread context to bind to, form an Address object with 2428 // depth 1 and return immediately (or, if affinity is off, set address2os to 2429 // NULL and return). 2430 // 2431 // If it is configured to omit the package level when there is only a single 2432 // package, the logic at the end of this routine won't work if there is only a 2433 // single thread - it would try to form an Address object with depth 0. 2434 KMP_ASSERT(num_avail > 0); 2435 KMP_ASSERT(num_avail <= num_records); 2436 if (num_avail == 1) { 2437 __kmp_ncores = 1; 2438 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2439 if (__kmp_affinity_verbose) { 2440 if (!KMP_AFFINITY_CAPABLE()) { 2441 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2442 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2443 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2444 } else { 2445 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2446 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2447 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2448 } 2449 int index; 2450 kmp_str_buf_t buf; 2451 __kmp_str_buf_init(&buf); 2452 __kmp_str_buf_print(&buf, "1"); 2453 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2454 __kmp_str_buf_print(&buf, " x 1"); 2455 } 2456 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2457 __kmp_str_buf_free(&buf); 2458 } 2459 2460 if (__kmp_affinity_type == affinity_none) { 2461 CLEANUP_THREAD_INFO; 2462 return 0; 2463 } 2464 2465 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2466 Address addr(1); 2467 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2468 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2469 2470 if (__kmp_affinity_gran_levels < 0) { 2471 __kmp_affinity_gran_levels = 0; 2472 } 2473 2474 if (__kmp_affinity_verbose) { 2475 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2476 } 2477 2478 CLEANUP_THREAD_INFO; 2479 return 1; 2480 } 2481 2482 // Sort the threadInfo table by physical Id. 2483 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2484 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2485 2486 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2487 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2488 // the chips on a system. Although coreId's are usually assigned 2489 // [0 .. 
coresPerPkg-1] and threadId's are usually assigned 2490 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2491 // 2492 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2493 // total # packages) are at this point - we want to determine that now. We 2494 // only have an upper bound on the first two figures. 2495 unsigned *counts = 2496 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2497 unsigned *maxCt = 2498 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2499 unsigned *totals = 2500 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2501 unsigned *lastId = 2502 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2503 2504 bool assign_thread_ids = false; 2505 unsigned threadIdCt; 2506 unsigned index; 2507 2508 restart_radix_check: 2509 threadIdCt = 0; 2510 2511 // Initialize the counter arrays with data from threadInfo[0]. 2512 if (assign_thread_ids) { 2513 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2514 threadInfo[0][threadIdIndex] = threadIdCt++; 2515 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2516 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2517 } 2518 } 2519 for (index = 0; index <= maxIndex; index++) { 2520 counts[index] = 1; 2521 maxCt[index] = 1; 2522 totals[index] = 1; 2523 lastId[index] = threadInfo[0][index]; 2524 ; 2525 } 2526 2527 // Run through the rest of the OS procs. 2528 for (i = 1; i < num_avail; i++) { 2529 // Find the most significant index whose id differs from the id for the 2530 // previous OS proc. 2531 for (index = maxIndex; index >= threadIdIndex; index--) { 2532 if (assign_thread_ids && (index == threadIdIndex)) { 2533 // Auto-assign the thread id field if it wasn't specified. 2534 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2535 threadInfo[i][threadIdIndex] = threadIdCt++; 2536 } 2537 // Apparently the thread id field was specified for some entries and not 2538 // others. Start the thread id counter off at the next higher thread id. 2539 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2540 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2541 } 2542 } 2543 if (threadInfo[i][index] != lastId[index]) { 2544 // Run through all indices which are less significant, and reset the 2545 // counts to 1. At all levels up to and including index, we need to 2546 // increment the totals and record the last id. 2547 unsigned index2; 2548 for (index2 = threadIdIndex; index2 < index; index2++) { 2549 totals[index2]++; 2550 if (counts[index2] > maxCt[index2]) { 2551 maxCt[index2] = counts[index2]; 2552 } 2553 counts[index2] = 1; 2554 lastId[index2] = threadInfo[i][index2]; 2555 } 2556 counts[index]++; 2557 totals[index]++; 2558 lastId[index] = threadInfo[i][index]; 2559 2560 if (assign_thread_ids && (index > threadIdIndex)) { 2561 2562 #if KMP_MIC && REDUCE_TEAM_SIZE 2563 // The default team size is the total #threads in the machine 2564 // minus 1 thread for every core that has 3 or more threads. 2565 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2566 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2567 2568 // Restart the thread counter, as we are on a new core. 2569 threadIdCt = 0; 2570 2571 // Auto-assign the thread id field if it wasn't specified. 2572 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2573 threadInfo[i][threadIdIndex] = threadIdCt++; 2574 } 2575 2576 // Apparently the thread id field was specified for some entries and 2577 // not others. Start the thread id counter off at the next higher 2578 // thread id. 
2579 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2580 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2581 }
2582 }
2583 break;
2584 }
2585 }
2586 if (index < threadIdIndex) {
2587 // If thread ids were specified, it is an error if they are not unique.
2588 // Also, check that we haven't already restarted the loop (to be safe -
2589 // shouldn't need to).
2590 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2591 __kmp_free(lastId);
2592 __kmp_free(totals);
2593 __kmp_free(maxCt);
2594 __kmp_free(counts);
2595 CLEANUP_THREAD_INFO;
2596 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2597 return -1;
2598 }
2599
2600 // If the thread ids were not specified and we see entries that are
2601 // duplicates, start the loop over and assign the thread ids manually.
2602 assign_thread_ids = true;
2603 goto restart_radix_check;
2604 }
2605 }
2606
2607 #if KMP_MIC && REDUCE_TEAM_SIZE
2608 // The default team size is the total #threads in the machine
2609 // minus 1 thread for every core that has 3 or more threads.
2610 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2611 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2612
2613 for (index = threadIdIndex; index <= maxIndex; index++) {
2614 if (counts[index] > maxCt[index]) {
2615 maxCt[index] = counts[index];
2616 }
2617 }
2618
2619 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2620 nCoresPerPkg = maxCt[coreIdIndex];
2621 nPackages = totals[pkgIdIndex];
2622
2623 // Check to see if the machine topology is uniform
2624 unsigned prod = totals[maxIndex];
2625 for (index = threadIdIndex; index < maxIndex; index++) {
2626 prod *= maxCt[index];
2627 }
2628 bool uniform = (prod == totals[threadIdIndex]);
2629
2630 // When affinity is off, this routine will still be called to set
2631 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2632 // Make sure all these vars are set correctly, and return now if affinity is
2633 // not enabled.
2634 __kmp_ncores = totals[coreIdIndex];
2635
2636 if (__kmp_affinity_verbose) {
2637 if (!KMP_AFFINITY_CAPABLE()) {
2638 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2639 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2640 if (uniform) {
2641 KMP_INFORM(Uniform, "KMP_AFFINITY");
2642 } else {
2643 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2644 }
2645 } else {
2646 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2647 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2648 if (uniform) {
2649 KMP_INFORM(Uniform, "KMP_AFFINITY");
2650 } else {
2651 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2652 }
2653 }
2654 kmp_str_buf_t buf;
2655 __kmp_str_buf_init(&buf);
2656
2657 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2658 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2659 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2660 }
2661 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2662 maxCt[threadIdIndex], __kmp_ncores);
2663
2664 __kmp_str_buf_free(&buf);
2665 }
2666
2667 #if KMP_MIC && REDUCE_TEAM_SIZE
2668 // Set the default team size.
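// Illustrative arithmetic (numbers assumed): on a MIC-like part with 60
// cores and 4 threads per core, each core contributes threadIdCt - 1 = 3,
// so teamSize = 60 * 3 = 180 and __kmp_dflt_team_nth is set to 180 below
// unless the user already chose a team size.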
2669 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2670 __kmp_dflt_team_nth = teamSize; 2671 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2672 "__kmp_dflt_team_nth = %d\n", 2673 __kmp_dflt_team_nth)); 2674 } 2675 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2676 2677 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2678 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2679 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2680 for (i = 0; i < num_avail; ++i) { // fill the os indices 2681 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2682 } 2683 2684 if (__kmp_affinity_type == affinity_none) { 2685 __kmp_free(lastId); 2686 __kmp_free(totals); 2687 __kmp_free(maxCt); 2688 __kmp_free(counts); 2689 CLEANUP_THREAD_INFO; 2690 return 0; 2691 } 2692 2693 // Count the number of levels which have more nodes at that level than at the 2694 // parent's level (with there being an implicit root node of the top level). 2695 // This is equivalent to saying that there is at least one node at this level 2696 // which has a sibling. These levels are in the map, and the package level is 2697 // always in the map. 2698 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2699 for (index = threadIdIndex; index < maxIndex; index++) { 2700 KMP_ASSERT(totals[index] >= totals[index + 1]); 2701 inMap[index] = (totals[index] > totals[index + 1]); 2702 } 2703 inMap[maxIndex] = (totals[maxIndex] > 1); 2704 inMap[pkgIdIndex] = true; 2705 2706 int depth = 0; 2707 for (index = threadIdIndex; index <= maxIndex; index++) { 2708 if (inMap[index]) { 2709 depth++; 2710 } 2711 } 2712 KMP_ASSERT(depth > 0); 2713 2714 // Construct the data structure that is to be returned. 2715 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2716 int pkgLevel = -1; 2717 int coreLevel = -1; 2718 int threadLevel = -1; 2719 2720 for (i = 0; i < num_avail; ++i) { 2721 Address addr(depth); 2722 unsigned os = threadInfo[i][osIdIndex]; 2723 int src_index; 2724 int dst_index = 0; 2725 2726 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2727 if (!inMap[src_index]) { 2728 continue; 2729 } 2730 addr.labels[dst_index] = threadInfo[i][src_index]; 2731 if (src_index == pkgIdIndex) { 2732 pkgLevel = dst_index; 2733 } else if (src_index == coreIdIndex) { 2734 coreLevel = dst_index; 2735 } else if (src_index == threadIdIndex) { 2736 threadLevel = dst_index; 2737 } 2738 dst_index++; 2739 } 2740 (*address2os)[i] = AddrUnsPair(addr, os); 2741 } 2742 2743 if (__kmp_affinity_gran_levels < 0) { 2744 // Set the granularity level based on what levels are modeled 2745 // in the machine topology map. 
2746 unsigned src_index; 2747 __kmp_affinity_gran_levels = 0; 2748 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2749 if (!inMap[src_index]) { 2750 continue; 2751 } 2752 switch (src_index) { 2753 case threadIdIndex: 2754 if (__kmp_affinity_gran > affinity_gran_thread) { 2755 __kmp_affinity_gran_levels++; 2756 } 2757 2758 break; 2759 case coreIdIndex: 2760 if (__kmp_affinity_gran > affinity_gran_core) { 2761 __kmp_affinity_gran_levels++; 2762 } 2763 break; 2764 2765 case pkgIdIndex: 2766 if (__kmp_affinity_gran > affinity_gran_package) { 2767 __kmp_affinity_gran_levels++; 2768 } 2769 break; 2770 } 2771 } 2772 } 2773 2774 if (__kmp_affinity_verbose) { 2775 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2776 coreLevel, threadLevel); 2777 } 2778 2779 __kmp_free(inMap); 2780 __kmp_free(lastId); 2781 __kmp_free(totals); 2782 __kmp_free(maxCt); 2783 __kmp_free(counts); 2784 CLEANUP_THREAD_INFO; 2785 return depth; 2786 } 2787 2788 // Create and return a table of affinity masks, indexed by OS thread ID. 2789 // This routine handles OR'ing together all the affinity masks of threads 2790 // that are sufficiently close, if granularity > fine. 2791 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2792 unsigned *numUnique, 2793 AddrUnsPair *address2os, 2794 unsigned numAddrs) { 2795 // First form a table of affinity masks in order of OS thread id. 2796 unsigned depth; 2797 unsigned maxOsId; 2798 unsigned i; 2799 2800 KMP_ASSERT(numAddrs > 0); 2801 depth = address2os[0].first.depth; 2802 2803 maxOsId = 0; 2804 for (i = numAddrs - 1;; --i) { 2805 unsigned osId = address2os[i].second; 2806 if (osId > maxOsId) { 2807 maxOsId = osId; 2808 } 2809 if (i == 0) 2810 break; 2811 } 2812 kmp_affin_mask_t *osId2Mask; 2813 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2814 2815 // Sort the address2os table according to physical order. Doing so will put 2816 // all threads on the same core/package/node in consecutive locations. 2817 qsort(address2os, numAddrs, sizeof(*address2os), 2818 __kmp_affinity_cmp_Address_labels); 2819 2820 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2821 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2822 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2823 } 2824 if (__kmp_affinity_gran_levels >= (int)depth) { 2825 if (__kmp_affinity_verbose || 2826 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2827 KMP_WARNING(AffThreadsMayMigrate); 2828 } 2829 } 2830 2831 // Run through the table, forming the masks for all threads on each core. 2832 // Threads on the same core will have identical "Address" objects, not 2833 // considering the last level, which must be the thread id. All threads on a 2834 // core will appear consecutively. 2835 unsigned unique = 0; 2836 unsigned j = 0; // index of 1st thread on core 2837 unsigned leader = 0; 2838 Address *leaderAddr = &(address2os[0].first); 2839 kmp_affin_mask_t *sum; 2840 KMP_CPU_ALLOC_ON_STACK(sum); 2841 KMP_CPU_ZERO(sum); 2842 KMP_CPU_SET(address2os[0].second, sum); 2843 for (i = 1; i < numAddrs; i++) { 2844 // If this thread is sufficiently close to the leader (within the 2845 // granularity setting), then set the bit for this os thread in the 2846 // affinity mask for this group, and go on to the next thread. 
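// Example (labels made up): with __kmp_affinity_gran_levels == 1
// (granularity=core on a socket/core/thread map), the entries
//   {0, 3, 0} -> OS proc 6   and   {0, 3, 1} -> OS proc 7
// differ only in the last (thread) label, so isClose() is true and both OS
// procs are OR'ed into the same mask.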
2847 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2848 KMP_CPU_SET(address2os[i].second, sum); 2849 continue; 2850 } 2851 2852 // For every thread in this group, copy the mask to the thread's entry in 2853 // the osId2Mask table. Mark the first address as a leader. 2854 for (; j < i; j++) { 2855 unsigned osId = address2os[j].second; 2856 KMP_DEBUG_ASSERT(osId <= maxOsId); 2857 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2858 KMP_CPU_COPY(mask, sum); 2859 address2os[j].first.leader = (j == leader); 2860 } 2861 unique++; 2862 2863 // Start a new mask. 2864 leader = i; 2865 leaderAddr = &(address2os[i].first); 2866 KMP_CPU_ZERO(sum); 2867 KMP_CPU_SET(address2os[i].second, sum); 2868 } 2869 2870 // For every thread in last group, copy the mask to the thread's 2871 // entry in the osId2Mask table. 2872 for (; j < i; j++) { 2873 unsigned osId = address2os[j].second; 2874 KMP_DEBUG_ASSERT(osId <= maxOsId); 2875 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2876 KMP_CPU_COPY(mask, sum); 2877 address2os[j].first.leader = (j == leader); 2878 } 2879 unique++; 2880 KMP_CPU_FREE_FROM_STACK(sum); 2881 2882 *maxIndex = maxOsId; 2883 *numUnique = unique; 2884 return osId2Mask; 2885 } 2886 2887 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2888 // as file-static than to try and pass them through the calling sequence of 2889 // the recursive-descent OMP_PLACES parser. 2890 static kmp_affin_mask_t *newMasks; 2891 static int numNewMasks; 2892 static int nextNewMask; 2893 2894 #define ADD_MASK(_mask) \ 2895 { \ 2896 if (nextNewMask >= numNewMasks) { \ 2897 int i; \ 2898 numNewMasks *= 2; \ 2899 kmp_affin_mask_t *temp; \ 2900 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2901 for (i = 0; i < numNewMasks / 2; i++) { \ 2902 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2903 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2904 KMP_CPU_COPY(dest, src); \ 2905 } \ 2906 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2907 newMasks = temp; \ 2908 } \ 2909 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2910 nextNewMask++; \ 2911 } 2912 2913 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2914 { \ 2915 if (((_osId) > _maxOsId) || \ 2916 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2917 if (__kmp_affinity_verbose || \ 2918 (__kmp_affinity_warnings && \ 2919 (__kmp_affinity_type != affinity_none))) { \ 2920 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2921 } \ 2922 } else { \ 2923 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2924 } \ 2925 } 2926 2927 // Re-parse the proclist (for the explicit affinity type), and form the list 2928 // of affinity newMasks indexed by gtid. 2929 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2930 unsigned int *out_numMasks, 2931 const char *proclist, 2932 kmp_affin_mask_t *osId2Mask, 2933 int maxOsId) { 2934 int i; 2935 const char *scan = proclist; 2936 const char *next = proclist; 2937 2938 // We use malloc() for the temporary mask vector, so that we can use 2939 // realloc() to extend it. 
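// For reference (illustrative), a proclist accepted by this parser looks
// like
//   "0,2,4-7,{8,9,10},16-30:2"
// i.e. single OS proc ids, ranges with an optional :stride, and {...} sets
// whose members are OR'ed into a single mask.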
2940 numNewMasks = 2; 2941 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2942 nextNewMask = 0; 2943 kmp_affin_mask_t *sumMask; 2944 KMP_CPU_ALLOC(sumMask); 2945 int setSize = 0; 2946 2947 for (;;) { 2948 int start, end, stride; 2949 2950 SKIP_WS(scan); 2951 next = scan; 2952 if (*next == '\0') { 2953 break; 2954 } 2955 2956 if (*next == '{') { 2957 int num; 2958 setSize = 0; 2959 next++; // skip '{' 2960 SKIP_WS(next); 2961 scan = next; 2962 2963 // Read the first integer in the set. 2964 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2965 SKIP_DIGITS(next); 2966 num = __kmp_str_to_int(scan, *next); 2967 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2968 2969 // Copy the mask for that osId to the sum (union) mask. 2970 if ((num > maxOsId) || 2971 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2972 if (__kmp_affinity_verbose || 2973 (__kmp_affinity_warnings && 2974 (__kmp_affinity_type != affinity_none))) { 2975 KMP_WARNING(AffIgnoreInvalidProcID, num); 2976 } 2977 KMP_CPU_ZERO(sumMask); 2978 } else { 2979 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2980 setSize = 1; 2981 } 2982 2983 for (;;) { 2984 // Check for end of set. 2985 SKIP_WS(next); 2986 if (*next == '}') { 2987 next++; // skip '}' 2988 break; 2989 } 2990 2991 // Skip optional comma. 2992 if (*next == ',') { 2993 next++; 2994 } 2995 SKIP_WS(next); 2996 2997 // Read the next integer in the set. 2998 scan = next; 2999 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3000 3001 SKIP_DIGITS(next); 3002 num = __kmp_str_to_int(scan, *next); 3003 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3004 3005 // Add the mask for that osId to the sum mask. 3006 if ((num > maxOsId) || 3007 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3008 if (__kmp_affinity_verbose || 3009 (__kmp_affinity_warnings && 3010 (__kmp_affinity_type != affinity_none))) { 3011 KMP_WARNING(AffIgnoreInvalidProcID, num); 3012 } 3013 } else { 3014 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3015 setSize++; 3016 } 3017 } 3018 if (setSize > 0) { 3019 ADD_MASK(sumMask); 3020 } 3021 3022 SKIP_WS(next); 3023 if (*next == ',') { 3024 next++; 3025 } 3026 scan = next; 3027 continue; 3028 } 3029 3030 // Read the first integer. 3031 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3032 SKIP_DIGITS(next); 3033 start = __kmp_str_to_int(scan, *next); 3034 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3035 SKIP_WS(next); 3036 3037 // If this isn't a range, then add a mask to the list and go on. 3038 if (*next != '-') { 3039 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3040 3041 // Skip optional comma. 3042 if (*next == ',') { 3043 next++; 3044 } 3045 scan = next; 3046 continue; 3047 } 3048 3049 // This is a range. Skip over the '-' and read in the 2nd int. 3050 next++; // skip '-' 3051 SKIP_WS(next); 3052 scan = next; 3053 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3054 SKIP_DIGITS(next); 3055 end = __kmp_str_to_int(scan, *next); 3056 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3057 3058 // Check for a stride parameter 3059 stride = 1; 3060 SKIP_WS(next); 3061 if (*next == ':') { 3062 // A stride is specified. Skip over the ':" and read the 3rd int. 
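// e.g. (illustrative) "7-1:-2" walks downward and adds masks for OS procs
// 7, 5, 3 and 1, while "0-6:2" adds masks for 0, 2, 4 and 6.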
3063 int sign = +1; 3064 next++; // skip ':' 3065 SKIP_WS(next); 3066 scan = next; 3067 if (*next == '-') { 3068 sign = -1; 3069 next++; 3070 SKIP_WS(next); 3071 scan = next; 3072 } 3073 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3074 SKIP_DIGITS(next); 3075 stride = __kmp_str_to_int(scan, *next); 3076 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 3077 stride *= sign; 3078 } 3079 3080 // Do some range checks. 3081 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3082 if (stride > 0) { 3083 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3084 } else { 3085 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3086 } 3087 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3088 3089 // Add the mask for each OS proc # to the list. 3090 if (stride > 0) { 3091 do { 3092 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3093 start += stride; 3094 } while (start <= end); 3095 } else { 3096 do { 3097 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3098 start += stride; 3099 } while (start >= end); 3100 } 3101 3102 // Skip optional comma. 3103 SKIP_WS(next); 3104 if (*next == ',') { 3105 next++; 3106 } 3107 scan = next; 3108 } 3109 3110 *out_numMasks = nextNewMask; 3111 if (nextNewMask == 0) { 3112 *out_masks = NULL; 3113 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3114 return; 3115 } 3116 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3117 for (i = 0; i < nextNewMask; i++) { 3118 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3119 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3120 KMP_CPU_COPY(dest, src); 3121 } 3122 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3123 KMP_CPU_FREE(sumMask); 3124 } 3125 3126 /*----------------------------------------------------------------------------- 3127 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3128 places. Again, Here is the grammar: 3129 3130 place_list := place 3131 place_list := place , place_list 3132 place := num 3133 place := place : num 3134 place := place : num : signed 3135 place := { subplacelist } 3136 place := ! 
place // (lowest priority) 3137 subplace_list := subplace 3138 subplace_list := subplace , subplace_list 3139 subplace := num 3140 subplace := num : num 3141 subplace := num : num : signed 3142 signed := num 3143 signed := + signed 3144 signed := - signed 3145 -----------------------------------------------------------------------------*/ 3146 static void __kmp_process_subplace_list(const char **scan, 3147 kmp_affin_mask_t *osId2Mask, 3148 int maxOsId, kmp_affin_mask_t *tempMask, 3149 int *setSize) { 3150 const char *next; 3151 3152 for (;;) { 3153 int start, count, stride, i; 3154 3155 // Read in the starting proc id 3156 SKIP_WS(*scan); 3157 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3158 next = *scan; 3159 SKIP_DIGITS(next); 3160 start = __kmp_str_to_int(*scan, *next); 3161 KMP_ASSERT(start >= 0); 3162 *scan = next; 3163 3164 // valid follow sets are ',' ':' and '}' 3165 SKIP_WS(*scan); 3166 if (**scan == '}' || **scan == ',') { 3167 if ((start > maxOsId) || 3168 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3169 if (__kmp_affinity_verbose || 3170 (__kmp_affinity_warnings && 3171 (__kmp_affinity_type != affinity_none))) { 3172 KMP_WARNING(AffIgnoreInvalidProcID, start); 3173 } 3174 } else { 3175 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3176 (*setSize)++; 3177 } 3178 if (**scan == '}') { 3179 break; 3180 } 3181 (*scan)++; // skip ',' 3182 continue; 3183 } 3184 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3185 (*scan)++; // skip ':' 3186 3187 // Read count parameter 3188 SKIP_WS(*scan); 3189 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3190 next = *scan; 3191 SKIP_DIGITS(next); 3192 count = __kmp_str_to_int(*scan, *next); 3193 KMP_ASSERT(count >= 0); 3194 *scan = next; 3195 3196 // valid follow sets are ',' ':' and '}' 3197 SKIP_WS(*scan); 3198 if (**scan == '}' || **scan == ',') { 3199 for (i = 0; i < count; i++) { 3200 if ((start > maxOsId) || 3201 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3202 if (__kmp_affinity_verbose || 3203 (__kmp_affinity_warnings && 3204 (__kmp_affinity_type != affinity_none))) { 3205 KMP_WARNING(AffIgnoreInvalidProcID, start); 3206 } 3207 break; // don't proliferate warnings for large count 3208 } else { 3209 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3210 start++; 3211 (*setSize)++; 3212 } 3213 } 3214 if (**scan == '}') { 3215 break; 3216 } 3217 (*scan)++; // skip ',' 3218 continue; 3219 } 3220 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3221 (*scan)++; // skip ':' 3222 3223 // Read stride parameter 3224 int sign = +1; 3225 for (;;) { 3226 SKIP_WS(*scan); 3227 if (**scan == '+') { 3228 (*scan)++; // skip '+' 3229 continue; 3230 } 3231 if (**scan == '-') { 3232 sign *= -1; 3233 (*scan)++; // skip '-' 3234 continue; 3235 } 3236 break; 3237 } 3238 SKIP_WS(*scan); 3239 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3240 next = *scan; 3241 SKIP_DIGITS(next); 3242 stride = __kmp_str_to_int(*scan, *next); 3243 KMP_ASSERT(stride >= 0); 3244 *scan = next; 3245 stride *= sign; 3246 3247 // valid follow sets are ',' and '}' 3248 SKIP_WS(*scan); 3249 if (**scan == '}' || **scan == ',') { 3250 for (i = 0; i < count; i++) { 3251 if ((start > maxOsId) || 3252 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3253 if (__kmp_affinity_verbose || 3254 (__kmp_affinity_warnings && 3255 (__kmp_affinity_type != affinity_none))) { 3256 KMP_WARNING(AffIgnoreInvalidProcID, start); 3257 } 3258 
break; // don't proliferate warnings for large count 3259 } else { 3260 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3261 start += stride; 3262 (*setSize)++; 3263 } 3264 } 3265 if (**scan == '}') { 3266 break; 3267 } 3268 (*scan)++; // skip ',' 3269 continue; 3270 } 3271 3272 KMP_ASSERT2(0, "bad explicit places list"); 3273 } 3274 } 3275 3276 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3277 int maxOsId, kmp_affin_mask_t *tempMask, 3278 int *setSize) { 3279 const char *next; 3280 3281 // valid follow sets are '{' '!' and num 3282 SKIP_WS(*scan); 3283 if (**scan == '{') { 3284 (*scan)++; // skip '{' 3285 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3286 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3287 (*scan)++; // skip '}' 3288 } else if (**scan == '!') { 3289 (*scan)++; // skip '!' 3290 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3291 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3292 } else if ((**scan >= '0') && (**scan <= '9')) { 3293 next = *scan; 3294 SKIP_DIGITS(next); 3295 int num = __kmp_str_to_int(*scan, *next); 3296 KMP_ASSERT(num >= 0); 3297 if ((num > maxOsId) || 3298 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3299 if (__kmp_affinity_verbose || 3300 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3301 KMP_WARNING(AffIgnoreInvalidProcID, num); 3302 } 3303 } else { 3304 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3305 (*setSize)++; 3306 } 3307 *scan = next; // skip num 3308 } else { 3309 KMP_ASSERT2(0, "bad explicit places list"); 3310 } 3311 } 3312 3313 // static void 3314 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3315 unsigned int *out_numMasks, 3316 const char *placelist, 3317 kmp_affin_mask_t *osId2Mask, 3318 int maxOsId) { 3319 int i, j, count, stride, sign; 3320 const char *scan = placelist; 3321 const char *next = placelist; 3322 3323 numNewMasks = 2; 3324 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3325 nextNewMask = 0; 3326 3327 // tempMask is modified based on the previous or initial 3328 // place to form the current place 3329 // previousMask contains the previous place 3330 kmp_affin_mask_t *tempMask; 3331 kmp_affin_mask_t *previousMask; 3332 KMP_CPU_ALLOC(tempMask); 3333 KMP_CPU_ZERO(tempMask); 3334 KMP_CPU_ALLOC(previousMask); 3335 KMP_CPU_ZERO(previousMask); 3336 int setSize = 0; 3337 3338 for (;;) { 3339 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3340 3341 // valid follow sets are ',' ':' and EOL 3342 SKIP_WS(scan); 3343 if (*scan == '\0' || *scan == ',') { 3344 if (setSize > 0) { 3345 ADD_MASK(tempMask); 3346 } 3347 KMP_CPU_ZERO(tempMask); 3348 setSize = 0; 3349 if (*scan == '\0') { 3350 break; 3351 } 3352 scan++; // skip ',' 3353 continue; 3354 } 3355 3356 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3357 scan++; // skip ':' 3358 3359 // Read count parameter 3360 SKIP_WS(scan); 3361 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3362 next = scan; 3363 SKIP_DIGITS(next); 3364 count = __kmp_str_to_int(scan, *next); 3365 KMP_ASSERT(count >= 0); 3366 scan = next; 3367 3368 // valid follow sets are ',' ':' and EOL 3369 SKIP_WS(scan); 3370 if (*scan == '\0' || *scan == ',') { 3371 stride = +1; 3372 } else { 3373 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3374 scan++; // skip ':' 3375 3376 // Read stride parameter 3377 sign = +1; 3378 for (;;) { 3379 SKIP_WS(scan); 3380 if (*scan == '+') { 3381 scan++; // skip 
'+' 3382 continue; 3383 } 3384 if (*scan == '-') { 3385 sign *= -1; 3386 scan++; // skip '-' 3387 continue; 3388 } 3389 break; 3390 } 3391 SKIP_WS(scan); 3392 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3393 next = scan; 3394 SKIP_DIGITS(next); 3395 stride = __kmp_str_to_int(scan, *next); 3396 KMP_DEBUG_ASSERT(stride >= 0); 3397 scan = next; 3398 stride *= sign; 3399 } 3400 3401 // Add places determined by initial_place : count : stride 3402 for (i = 0; i < count; i++) { 3403 if (setSize == 0) { 3404 break; 3405 } 3406 // Add the current place, then build the next place (tempMask) from that 3407 KMP_CPU_COPY(previousMask, tempMask); 3408 ADD_MASK(previousMask); 3409 KMP_CPU_ZERO(tempMask); 3410 setSize = 0; 3411 KMP_CPU_SET_ITERATE(j, previousMask) { 3412 if (!KMP_CPU_ISSET(j, previousMask)) { 3413 continue; 3414 } 3415 if ((j + stride > maxOsId) || (j + stride < 0) || 3416 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3417 (!KMP_CPU_ISSET(j + stride, 3418 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3419 if ((__kmp_affinity_verbose || 3420 (__kmp_affinity_warnings && 3421 (__kmp_affinity_type != affinity_none))) && 3422 i < count - 1) { 3423 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3424 } 3425 continue; 3426 } 3427 KMP_CPU_SET(j + stride, tempMask); 3428 setSize++; 3429 } 3430 } 3431 KMP_CPU_ZERO(tempMask); 3432 setSize = 0; 3433 3434 // valid follow sets are ',' and EOL 3435 SKIP_WS(scan); 3436 if (*scan == '\0') { 3437 break; 3438 } 3439 if (*scan == ',') { 3440 scan++; // skip ',' 3441 continue; 3442 } 3443 3444 KMP_ASSERT2(0, "bad explicit places list"); 3445 } 3446 3447 *out_numMasks = nextNewMask; 3448 if (nextNewMask == 0) { 3449 *out_masks = NULL; 3450 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3451 return; 3452 } 3453 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3454 KMP_CPU_FREE(tempMask); 3455 KMP_CPU_FREE(previousMask); 3456 for (i = 0; i < nextNewMask; i++) { 3457 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3458 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3459 KMP_CPU_COPY(dest, src); 3460 } 3461 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3462 } 3463 3464 #undef ADD_MASK 3465 #undef ADD_MASK_OSID 3466 3467 #if KMP_USE_HWLOC 3468 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3469 // skip PUs descendants of the object o 3470 int skipped = 0; 3471 hwloc_obj_t hT = NULL; 3472 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3473 for (int i = 0; i < N; ++i) { 3474 KMP_DEBUG_ASSERT(hT); 3475 unsigned idx = hT->os_index; 3476 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3477 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3478 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3479 ++skipped; 3480 } 3481 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3482 } 3483 return skipped; // count number of skipped units 3484 } 3485 3486 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3487 // check if obj has PUs present in fullMask 3488 hwloc_obj_t hT = NULL; 3489 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3490 for (int i = 0; i < N; ++i) { 3491 KMP_DEBUG_ASSERT(hT); 3492 unsigned idx = hT->os_index; 3493 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) 3494 return 1; // found PU 3495 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3496 } 3497 return 0; // no PUs found 3498 } 3499 #endif // KMP_USE_HWLOC 3500 3501 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3502 AddrUnsPair 
*newAddr;
3503 if (__kmp_hws_requested == 0)
3504 goto _exit; // no topology limiting actions requested, exit
3505 #if KMP_USE_HWLOC
3506 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3507 // Number of subobjects is calculated dynamically; this works fine for
3508 // any non-uniform topology.
3509 // L2 cache objects are determined by depth, other objects - by type.
3510 hwloc_topology_t tp = __kmp_hwloc_topology;
3511 int nS = 0, nN = 0, nL = 0, nC = 0,
3512 nT = 0; // logical index including skipped
3513 int nCr = 0, nTr = 0; // number of requested units
3514 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3515 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3516 int L2depth, idx;
3517
3518 // check support of extensions ----------------------------------
3519 int numa_support = 0, tile_support = 0;
3520 if (__kmp_pu_os_idx)
3521 hT = hwloc_get_pu_obj_by_os_index(tp,
3522 __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3523 else
3524 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3525 if (hT == NULL) { // something's gone wrong
3526 KMP_WARNING(AffHWSubsetUnsupported);
3527 goto _exit;
3528 }
3529 // check NUMA node
3530 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3531 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3532 if (hN != NULL && hN->depth > hS->depth) {
3533 numa_support = 1; // 1 in case socket includes node(s)
3534 } else if (__kmp_hws_node.num > 0) {
3535 // sockets inside a NUMA node are not supported (no such HW found for testing)
3536 KMP_WARNING(AffHWSubsetUnsupported);
3537 goto _exit;
3538 }
3539 // check L2 cache, get object by depth because of multiple caches
3540 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3541 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3542 if (hL != NULL &&
3543 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
3544 tile_support = 1; // no point counting L2 if it contains a single core
3545 } else if (__kmp_hws_tile.num > 0) {
3546 if (__kmp_hws_core.num == 0) {
3547 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3548 __kmp_hws_tile.num = 0;
3549 } else {
3550 // L2 and core are both requested, but represent the same object
3551 KMP_WARNING(AffHWSubsetInvalid);
3552 goto _exit;
3553 }
3554 }
3555 // end of check of extensions -----------------------------------
3556
3557 // fill in unset items, validate settings -----------------------
3558 if (__kmp_hws_socket.num == 0)
3559 __kmp_hws_socket.num = nPackages; // use all available sockets
3560 if (__kmp_hws_socket.offset >= nPackages) {
3561 KMP_WARNING(AffHWSubsetManySockets);
3562 goto _exit;
3563 }
3564 if (numa_support) {
3565 hN = NULL;
3566 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3567 &hN); // num nodes in socket
3568 if (__kmp_hws_node.num == 0)
3569 __kmp_hws_node.num = NN; // use all available nodes
3570 if (__kmp_hws_node.offset >= NN) {
3571 KMP_WARNING(AffHWSubsetManyNodes);
3572 goto _exit;
3573 }
3574 if (tile_support) {
3575 // get num tiles in node
3576 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3577 if (__kmp_hws_tile.num == 0) {
3578 __kmp_hws_tile.num = NL + 1;
3579 } // use all available tiles, some node may have more tiles, thus +1
3580 if (__kmp_hws_tile.offset >= NL) {
3581 KMP_WARNING(AffHWSubsetManyTiles);
3582 goto _exit;
3583 }
3584 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3585 &hC); // num cores in tile
3586 if
(__kmp_hws_core.num == 0) 3587 __kmp_hws_core.num = NC; // use all available cores 3588 if (__kmp_hws_core.offset >= NC) { 3589 KMP_WARNING(AffHWSubsetManyCores); 3590 goto _exit; 3591 } 3592 } else { // tile_support 3593 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3594 &hC); // num cores in node 3595 if (__kmp_hws_core.num == 0) 3596 __kmp_hws_core.num = NC; // use all available cores 3597 if (__kmp_hws_core.offset >= NC) { 3598 KMP_WARNING(AffHWSubsetManyCores); 3599 goto _exit; 3600 } 3601 } // tile_support 3602 } else { // numa_support 3603 if (tile_support) { 3604 // get num tiles in socket 3605 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3606 if (__kmp_hws_tile.num == 0) 3607 __kmp_hws_tile.num = NL; // use all available tiles 3608 if (__kmp_hws_tile.offset >= NL) { 3609 KMP_WARNING(AffHWSubsetManyTiles); 3610 goto _exit; 3611 } 3612 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3613 &hC); // num cores in tile 3614 if (__kmp_hws_core.num == 0) 3615 __kmp_hws_core.num = NC; // use all available cores 3616 if (__kmp_hws_core.offset >= NC) { 3617 KMP_WARNING(AffHWSubsetManyCores); 3618 goto _exit; 3619 } 3620 } else { // tile_support 3621 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3622 &hC); // num cores in socket 3623 if (__kmp_hws_core.num == 0) 3624 __kmp_hws_core.num = NC; // use all available cores 3625 if (__kmp_hws_core.offset >= NC) { 3626 KMP_WARNING(AffHWSubsetManyCores); 3627 goto _exit; 3628 } 3629 } // tile_support 3630 } 3631 if (__kmp_hws_proc.num == 0) 3632 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3633 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3634 KMP_WARNING(AffHWSubsetManyProcs); 3635 goto _exit; 3636 } 3637 // end of validation -------------------------------------------- 3638 3639 if (pAddr) // pAddr is NULL in case of affinity_none 3640 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3641 __kmp_avail_proc); // max size 3642 // main loop to form HW subset ---------------------------------- 3643 hS = NULL; 3644 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3645 for (int s = 0; s < NP; ++s) { 3646 // Check Socket ----------------------------------------------- 3647 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3648 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3649 continue; // skip socket if all PUs are out of fullMask 3650 ++nS; // only count objects those have PUs in affinity mask 3651 if (nS <= __kmp_hws_socket.offset || 3652 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3653 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3654 continue; // move to next socket 3655 } 3656 nCr = 0; // count number of cores per socket 3657 // socket requested, go down the topology tree 3658 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3659 if (numa_support) { 3660 nN = 0; 3661 hN = NULL; 3662 // num nodes in current socket 3663 int NN = 3664 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3665 for (int n = 0; n < NN; ++n) { 3666 // Check NUMA Node ---------------------------------------- 3667 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3668 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3669 continue; // skip node if all PUs are out of fullMask 3670 } 3671 ++nN; 3672 if (nN <= __kmp_hws_node.offset || 3673 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3674 // skip node as not requested 3675 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // 
skip node 3676 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3677 continue; // move to next node 3678 } 3679 // node requested, go down the topology tree 3680 if (tile_support) { 3681 nL = 0; 3682 hL = NULL; 3683 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3684 for (int l = 0; l < NL; ++l) { 3685 // Check L2 (tile) ------------------------------------ 3686 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3687 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3688 continue; // skip tile if all PUs are out of fullMask 3689 } 3690 ++nL; 3691 if (nL <= __kmp_hws_tile.offset || 3692 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3693 // skip tile as not requested 3694 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3695 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3696 continue; // move to next tile 3697 } 3698 // tile requested, go down the topology tree 3699 nC = 0; 3700 hC = NULL; 3701 // num cores in current tile 3702 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3703 HWLOC_OBJ_CORE, &hC); 3704 for (int c = 0; c < NC; ++c) { 3705 // Check Core --------------------------------------- 3706 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3707 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3708 continue; // skip core if all PUs are out of fullMask 3709 } 3710 ++nC; 3711 if (nC <= __kmp_hws_core.offset || 3712 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3713 // skip node as not requested 3714 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3715 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3716 continue; // move to next node 3717 } 3718 // core requested, go down to PUs 3719 nT = 0; 3720 nTr = 0; 3721 hT = NULL; 3722 // num procs in current core 3723 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3724 HWLOC_OBJ_PU, &hT); 3725 for (int t = 0; t < NT; ++t) { 3726 // Check PU --------------------------------------- 3727 idx = hT->os_index; 3728 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3729 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3730 continue; // skip PU if not in fullMask 3731 } 3732 ++nT; 3733 if (nT <= __kmp_hws_proc.offset || 3734 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3735 // skip PU 3736 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3737 ++n_old; 3738 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3739 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3740 continue; // move to next node 3741 } 3742 ++nTr; 3743 if (pAddr) // collect requested thread's data 3744 newAddr[n_new] = (*pAddr)[n_old]; 3745 ++n_new; 3746 ++n_old; 3747 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3748 } // threads loop 3749 if (nTr > 0) { 3750 ++nCr; // num cores per socket 3751 ++nCo; // total num cores 3752 if (nTr > nTpC) 3753 nTpC = nTr; // calc max threads per core 3754 } 3755 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3756 } // cores loop 3757 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3758 } // tiles loop 3759 } else { // tile_support 3760 // no tiles, check cores 3761 nC = 0; 3762 hC = NULL; 3763 // num cores in current node 3764 int NC = 3765 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3766 for (int c = 0; c < NC; ++c) { 3767 // Check Core --------------------------------------- 3768 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3769 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3770 continue; // skip core if all PUs are out of fullMask 3771 } 3772 ++nC; 3773 if (nC <= __kmp_hws_core.offset || 3774 nC > __kmp_hws_core.num 
+ __kmp_hws_core.offset) { 3775 // skip node as not requested 3776 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3777 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3778 continue; // move to next node 3779 } 3780 // core requested, go down to PUs 3781 nT = 0; 3782 nTr = 0; 3783 hT = NULL; 3784 int NT = 3785 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3786 for (int t = 0; t < NT; ++t) { 3787 // Check PU --------------------------------------- 3788 idx = hT->os_index; 3789 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3790 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3791 continue; // skip PU if not in fullMask 3792 } 3793 ++nT; 3794 if (nT <= __kmp_hws_proc.offset || 3795 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3796 // skip PU 3797 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3798 ++n_old; 3799 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3800 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3801 continue; // move to next node 3802 } 3803 ++nTr; 3804 if (pAddr) // collect requested thread's data 3805 newAddr[n_new] = (*pAddr)[n_old]; 3806 ++n_new; 3807 ++n_old; 3808 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3809 } // threads loop 3810 if (nTr > 0) { 3811 ++nCr; // num cores per socket 3812 ++nCo; // total num cores 3813 if (nTr > nTpC) 3814 nTpC = nTr; // calc max threads per core 3815 } 3816 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3817 } // cores loop 3818 } // tiles support 3819 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3820 } // nodes loop 3821 } else { // numa_support 3822 // no NUMA support 3823 if (tile_support) { 3824 nL = 0; 3825 hL = NULL; 3826 // num tiles in current socket 3827 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3828 for (int l = 0; l < NL; ++l) { 3829 // Check L2 (tile) ------------------------------------ 3830 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3831 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3832 continue; // skip tile if all PUs are out of fullMask 3833 } 3834 ++nL; 3835 if (nL <= __kmp_hws_tile.offset || 3836 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3837 // skip tile as not requested 3838 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3839 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3840 continue; // move to next tile 3841 } 3842 // tile requested, go down the topology tree 3843 nC = 0; 3844 hC = NULL; 3845 // num cores per tile 3846 int NC = 3847 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); 3848 for (int c = 0; c < NC; ++c) { 3849 // Check Core --------------------------------------- 3850 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3851 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3852 continue; // skip core if all PUs are out of fullMask 3853 } 3854 ++nC; 3855 if (nC <= __kmp_hws_core.offset || 3856 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3857 // skip node as not requested 3858 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3859 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3860 continue; // move to next node 3861 } 3862 // core requested, go down to PUs 3863 nT = 0; 3864 nTr = 0; 3865 hT = NULL; 3866 // num procs per core 3867 int NT = 3868 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3869 for (int t = 0; t < NT; ++t) { 3870 // Check PU --------------------------------------- 3871 idx = hT->os_index; 3872 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3873 hT = hwloc_get_next_obj_by_type(tp, 
HWLOC_OBJ_PU, hT); 3874 continue; // skip PU if not in fullMask 3875 } 3876 ++nT; 3877 if (nT <= __kmp_hws_proc.offset || 3878 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3879 // skip PU 3880 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3881 ++n_old; 3882 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3883 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3884 continue; // move to next node 3885 } 3886 ++nTr; 3887 if (pAddr) // collect requested thread's data 3888 newAddr[n_new] = (*pAddr)[n_old]; 3889 ++n_new; 3890 ++n_old; 3891 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3892 } // threads loop 3893 if (nTr > 0) { 3894 ++nCr; // num cores per socket 3895 ++nCo; // total num cores 3896 if (nTr > nTpC) 3897 nTpC = nTr; // calc max threads per core 3898 } 3899 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3900 } // cores loop 3901 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3902 } // tiles loop 3903 } else { // tile_support 3904 // no tiles, check cores 3905 nC = 0; 3906 hC = NULL; 3907 // num cores in socket 3908 int NC = 3909 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3910 for (int c = 0; c < NC; ++c) { 3911 // Check Core ------------------------------------------- 3912 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3913 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3914 continue; // skip core if all PUs are out of fullMask 3915 } 3916 ++nC; 3917 if (nC <= __kmp_hws_core.offset || 3918 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3919 // skip node as not requested 3920 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3921 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3922 continue; // move to next node 3923 } 3924 // core requested, go down to PUs 3925 nT = 0; 3926 nTr = 0; 3927 hT = NULL; 3928 // num procs per core 3929 int NT = 3930 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3931 for (int t = 0; t < NT; ++t) { 3932 // Check PU --------------------------------------- 3933 idx = hT->os_index; 3934 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3935 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3936 continue; // skip PU if not in fullMask 3937 } 3938 ++nT; 3939 if (nT <= __kmp_hws_proc.offset || 3940 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3941 // skip PU 3942 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3943 ++n_old; 3944 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3945 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3946 continue; // move to next node 3947 } 3948 ++nTr; 3949 if (pAddr) // collect requested thread's data 3950 newAddr[n_new] = (*pAddr)[n_old]; 3951 ++n_new; 3952 ++n_old; 3953 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3954 } // threads loop 3955 if (nTr > 0) { 3956 ++nCr; // num cores per socket 3957 ++nCo; // total num cores 3958 if (nTr > nTpC) 3959 nTpC = nTr; // calc max threads per core 3960 } 3961 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3962 } // cores loop 3963 } // tiles support 3964 } // numa_support 3965 if (nCr > 0) { // found cores? 
3966 ++nPkg; // num sockets 3967 if (nCr > nCpP) 3968 nCpP = nCr; // calc max cores per socket 3969 } 3970 } // sockets loop 3971 3972 // check the subset is valid 3973 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3974 KMP_DEBUG_ASSERT(nPkg > 0); 3975 KMP_DEBUG_ASSERT(nCpP > 0); 3976 KMP_DEBUG_ASSERT(nTpC > 0); 3977 KMP_DEBUG_ASSERT(nCo > 0); 3978 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3979 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3980 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3981 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3982 3983 nPackages = nPkg; // correct num sockets 3984 nCoresPerPkg = nCpP; // correct num cores per socket 3985 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3986 __kmp_avail_proc = n_new; // correct num procs 3987 __kmp_ncores = nCo; // correct num cores 3988 // hwloc topology method end 3989 } else 3990 #endif // KMP_USE_HWLOC 3991 { 3992 int n_old = 0, n_new = 0, proc_num = 0; 3993 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3994 KMP_WARNING(AffHWSubsetNoHWLOC); 3995 goto _exit; 3996 } 3997 if (__kmp_hws_socket.num == 0) 3998 __kmp_hws_socket.num = nPackages; // use all available sockets 3999 if (__kmp_hws_die.num == 0) 4000 __kmp_hws_die.num = nDiesPerPkg; // use all available dies 4001 if (__kmp_hws_core.num == 0) 4002 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 4003 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 4004 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 4005 if (!__kmp_affinity_uniform_topology()) { 4006 KMP_WARNING(AffHWSubsetNonUniform); 4007 goto _exit; // don't support non-uniform topology 4008 } 4009 if (depth > 4) { 4010 KMP_WARNING(AffHWSubsetNonThreeLevel); 4011 goto _exit; // don't support not-3-level topology 4012 } 4013 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 4014 KMP_WARNING(AffHWSubsetManySockets); 4015 goto _exit; 4016 } 4017 if (depth == 4 && __kmp_hws_die.offset + __kmp_hws_die.num > nDiesPerPkg) { 4018 KMP_WARNING(AffHWSubsetManyDies); 4019 goto _exit; 4020 } 4021 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 4022 KMP_WARNING(AffHWSubsetManyCores); 4023 goto _exit; 4024 } 4025 // Form the requested subset 4026 if (pAddr) // pAddr is NULL in case of affinity_none 4027 newAddr = (AddrUnsPair *)__kmp_allocate( 4028 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_die.num * 4029 __kmp_hws_core.num * __kmp_hws_proc.num); 4030 for (int i = 0; i < nPackages; ++i) { 4031 if (i < __kmp_hws_socket.offset || 4032 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 4033 // skip not-requested socket 4034 n_old += nDiesPerPkg * nCoresPerPkg * __kmp_nThreadsPerCore; 4035 if (__kmp_pu_os_idx != NULL) { 4036 // walk through skipped socket 4037 for (int l = 0; l < nDiesPerPkg; ++l) { 4038 for (int j = 0; j < nCoresPerPkg; ++j) { 4039 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4040 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 4041 ++proc_num; 4042 } 4043 } 4044 } 4045 } 4046 } else { 4047 // walk through requested socket 4048 for (int l = 0; l < nDiesPerPkg; ++l) { 4049 // skip unwanted die 4050 if (l < __kmp_hws_die.offset || 4051 l >= __kmp_hws_die.offset + __kmp_hws_die.num) { 4052 n_old += nCoresPerPkg; 4053 if (__kmp_pu_os_idx != NULL) { 4054 for (int k = 0; k < nCoresPerPkg; ++k) { 4055 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 4056 ++proc_num; 4057 } 4058 } 4059 } else { 4060 for (int j = 0; j < nCoresPerPkg; ++j) { 4061 if (j < __kmp_hws_core.offset || 4062 
j >= __kmp_hws_core.offset + 4063 __kmp_hws_core.num) { // skip not-requested core 4064 n_old += __kmp_nThreadsPerCore; 4065 if (__kmp_pu_os_idx != NULL) { 4066 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4067 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], 4068 __kmp_affin_fullMask); 4069 ++proc_num; 4070 } 4071 } 4072 } else { 4073 // walk through requested core 4074 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4075 if (k < __kmp_hws_proc.num) { 4076 if (pAddr) // collect requested thread's data 4077 newAddr[n_new] = (*pAddr)[n_old]; 4078 n_new++; 4079 } else { 4080 if (__kmp_pu_os_idx != NULL) 4081 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], 4082 __kmp_affin_fullMask); 4083 } 4084 n_old++; 4085 ++proc_num; 4086 } 4087 } 4088 } 4089 } 4090 } 4091 } 4092 } 4093 KMP_DEBUG_ASSERT(n_old == nPackages * nDiesPerPkg * nCoresPerPkg * 4094 __kmp_nThreadsPerCore); 4095 KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_die.num * 4096 __kmp_hws_core.num * __kmp_hws_proc.num); 4097 nPackages = __kmp_hws_socket.num; // correct nPackages 4098 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 4099 nDiesPerPkg = __kmp_hws_die.num; // correct nDiesPerPkg 4100 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 4101 __kmp_avail_proc = n_new; // correct avail_proc 4102 __kmp_ncores = 4103 nPackages * nDiesPerPkg * __kmp_hws_core.num; // correct ncores 4104 } // non-hwloc topology method 4105 if (pAddr) { 4106 __kmp_free(*pAddr); 4107 *pAddr = newAddr; // replace old topology with new one 4108 } 4109 if (__kmp_affinity_verbose) { 4110 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 4111 kmp_str_buf_t buf; 4112 __kmp_str_buf_init(&buf); 4113 __kmp_str_buf_print(&buf, "%d", nPackages); 4114 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 4115 __kmp_nThreadsPerCore, __kmp_ncores); 4116 __kmp_str_buf_free(&buf); 4117 } 4118 _exit: 4119 if (__kmp_pu_os_idx != NULL) { 4120 __kmp_free(__kmp_pu_os_idx); 4121 __kmp_pu_os_idx = NULL; 4122 } 4123 } 4124 4125 // This function figures out the deepest level at which there is at least one 4126 // cluster/core with more than one processing unit bound to it. 4127 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 4128 int nprocs, int bottom_level) { 4129 int core_level = 0; 4130 4131 for (int i = 0; i < nprocs; i++) { 4132 for (int j = bottom_level; j > 0; j--) { 4133 if (address2os[i].first.labels[j] > 0) { 4134 if (core_level < (j - 1)) { 4135 core_level = j - 1; 4136 } 4137 } 4138 } 4139 } 4140 return core_level; 4141 } 4142 4143 // This function counts number of clusters/cores at given level. 4144 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 4145 int nprocs, int bottom_level, 4146 int core_level) { 4147 int ncores = 0; 4148 int i, j; 4149 4150 j = bottom_level; 4151 for (i = 0; i < nprocs; i++) { 4152 for (j = bottom_level; j > core_level; j--) { 4153 if ((i + 1) < nprocs) { 4154 if (address2os[i + 1].first.labels[j] > 0) { 4155 break; 4156 } 4157 } 4158 } 4159 if (j == core_level) { 4160 ncores++; 4161 } 4162 } 4163 if (j > core_level) { 4164 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 4165 // core. May occur when called from __kmp_affinity_find_core(). 4166 ncores++; 4167 } 4168 return ncores; 4169 } 4170 4171 // This function finds to which cluster/core given processing unit is bound. 
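// For illustration only (hypothetical topology, not taken from any real
// machine): on a 2-socket x 2-core x 2-thread box whose address2os table is
// sorted in topology order, the eight PUs map to core indices 0,0,1,1,2,2,3,3,
// because the core index is obtained by counting distinct cores over the
// first (proc + 1) table entries and subtracting one.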
4172 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, 4173 int bottom_level, int core_level) { 4174 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, 4175 core_level) - 4176 1; 4177 } 4178 4179 // This function finds maximal number of processing units bound to a 4180 // cluster/core at given level. 4181 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 4182 int nprocs, int bottom_level, 4183 int core_level) { 4184 int maxprocpercore = 0; 4185 4186 if (core_level < bottom_level) { 4187 for (int i = 0; i < nprocs; i++) { 4188 int percore = address2os[i].first.labels[core_level + 1] + 1; 4189 4190 if (percore > maxprocpercore) { 4191 maxprocpercore = percore; 4192 } 4193 } 4194 } else { 4195 maxprocpercore = 1; 4196 } 4197 return maxprocpercore; 4198 } 4199 4200 static AddrUnsPair *address2os = NULL; 4201 static int *procarr = NULL; 4202 static int __kmp_aff_depth = 0; 4203 4204 #if KMP_USE_HIER_SCHED 4205 #define KMP_EXIT_AFF_NONE \ 4206 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4207 KMP_ASSERT(address2os == NULL); \ 4208 __kmp_apply_thread_places(NULL, 0); \ 4209 __kmp_create_affinity_none_places(); \ 4210 __kmp_dispatch_set_hierarchy_values(); \ 4211 return; 4212 #else 4213 #define KMP_EXIT_AFF_NONE \ 4214 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4215 KMP_ASSERT(address2os == NULL); \ 4216 __kmp_apply_thread_places(NULL, 0); \ 4217 __kmp_create_affinity_none_places(); \ 4218 return; 4219 #endif 4220 4221 // Create a one element mask array (set of places) which only contains the 4222 // initial process's affinity mask 4223 static void __kmp_create_affinity_none_places() { 4224 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4225 KMP_ASSERT(__kmp_affinity_type == affinity_none); 4226 __kmp_affinity_num_masks = 1; 4227 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4228 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 4229 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 4230 } 4231 4232 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 4233 const Address *aa = &(((const AddrUnsPair *)a)->first); 4234 const Address *bb = &(((const AddrUnsPair *)b)->first); 4235 unsigned depth = aa->depth; 4236 unsigned i; 4237 KMP_DEBUG_ASSERT(depth == bb->depth); 4238 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 4239 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 4240 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 4241 int j = depth - i - 1; 4242 if (aa->childNums[j] < bb->childNums[j]) 4243 return -1; 4244 if (aa->childNums[j] > bb->childNums[j]) 4245 return 1; 4246 } 4247 for (; i < depth; i++) { 4248 int j = i - __kmp_affinity_compact; 4249 if (aa->childNums[j] < bb->childNums[j]) 4250 return -1; 4251 if (aa->childNums[j] > bb->childNums[j]) 4252 return 1; 4253 } 4254 return 0; 4255 } 4256 4257 static void __kmp_aux_affinity_initialize(void) { 4258 if (__kmp_affinity_masks != NULL) { 4259 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4260 return; 4261 } 4262 4263 // Create the "full" mask - this defines all of the processors that we 4264 // consider to be in the machine model. If respect is set, then it is the 4265 // initialization thread's affinity mask. Otherwise, it is all processors that 4266 // we know about on the machine. 
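// For illustration (hypothetical launch, not from the sources): with the
// default "respect" setting, a process started under "taskset -c 0-3" on a
// 16-proc Linux box ends up with a full mask of {0,1,2,3}; with "norespect"
// the full mask covers all 16 OS procs reported by the system.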
4267 if (__kmp_affin_fullMask == NULL) { 4268 KMP_CPU_ALLOC(__kmp_affin_fullMask); 4269 } 4270 if (KMP_AFFINITY_CAPABLE()) { 4271 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 4272 if (__kmp_affinity_respect_mask) { 4273 // Count the number of available processors. 4274 unsigned i; 4275 __kmp_avail_proc = 0; 4276 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 4277 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 4278 continue; 4279 } 4280 __kmp_avail_proc++; 4281 } 4282 if (__kmp_avail_proc > __kmp_xproc) { 4283 if (__kmp_affinity_verbose || 4284 (__kmp_affinity_warnings && 4285 (__kmp_affinity_type != affinity_none))) { 4286 KMP_WARNING(ErrorInitializeAffinity); 4287 } 4288 __kmp_affinity_type = affinity_none; 4289 KMP_AFFINITY_DISABLE(); 4290 return; 4291 } 4292 4293 if (__kmp_affinity_verbose) { 4294 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4295 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4296 __kmp_affin_fullMask); 4297 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 4298 } 4299 } else { 4300 if (__kmp_affinity_verbose) { 4301 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4302 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4303 __kmp_affin_fullMask); 4304 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 4305 } 4306 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 4307 __kmp_avail_proc = __kmp_xproc; 4308 #if KMP_OS_WINDOWS 4309 // Set the process affinity mask since threads' affinity 4310 // masks must be subset of process mask in Windows* OS 4311 __kmp_affin_fullMask->set_process_affinity(true); 4312 #endif 4313 } 4314 } 4315 4316 if (__kmp_affinity_gran == affinity_gran_tile && 4317 // check if user's request is valid 4318 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { 4319 KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); 4320 __kmp_affinity_gran = affinity_gran_package; 4321 } 4322 4323 int depth = -1; 4324 kmp_i18n_id_t msg_id = kmp_i18n_null; 4325 4326 // For backward compatibility, setting KMP_CPUINFO_FILE => 4327 // KMP_TOPOLOGY_METHOD=cpuinfo 4328 if ((__kmp_cpuinfo_file != NULL) && 4329 (__kmp_affinity_top_method == affinity_top_method_all)) { 4330 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 4331 } 4332 4333 if (__kmp_affinity_top_method == affinity_top_method_all) { 4334 // In the default code path, errors are not fatal - we just try using 4335 // another method. We only emit a warning message if affinity is on, or the 4336 // verbose flag is set, and the nowarnings flag was not set. 
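// Roughly, the fallback order tried below is: hwloc (if built in), x2APIC
// leaf decoding, legacy APIC decoding, parsing /proc/cpuinfo on Linux,
// Windows processor groups, and finally the flat OS-proc map; which of these
// are attempted depends on the target architecture and OS.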
4337 const char *file_name = NULL; 4338 int line = 0; 4339 #if KMP_USE_HWLOC 4340 if (depth < 0 && 4341 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4342 if (__kmp_affinity_verbose) { 4343 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4344 } 4345 if (!__kmp_hwloc_error) { 4346 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4347 if (depth == 0) { 4348 KMP_EXIT_AFF_NONE; 4349 } else if (depth < 0 && __kmp_affinity_verbose) { 4350 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4351 } 4352 } else if (__kmp_affinity_verbose) { 4353 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4354 } 4355 } 4356 #endif 4357 4358 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4359 4360 if (depth < 0) { 4361 if (__kmp_affinity_verbose) { 4362 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4363 } 4364 4365 file_name = NULL; 4366 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4367 if (depth == 0) { 4368 KMP_EXIT_AFF_NONE; 4369 } 4370 4371 if (depth < 0) { 4372 if (__kmp_affinity_verbose) { 4373 if (msg_id != kmp_i18n_null) { 4374 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", 4375 __kmp_i18n_catgets(msg_id), 4376 KMP_I18N_STR(DecodingLegacyAPIC)); 4377 } else { 4378 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4379 KMP_I18N_STR(DecodingLegacyAPIC)); 4380 } 4381 } 4382 4383 file_name = NULL; 4384 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4385 if (depth == 0) { 4386 KMP_EXIT_AFF_NONE; 4387 } 4388 } 4389 } 4390 4391 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4392 4393 #if KMP_OS_LINUX 4394 4395 if (depth < 0) { 4396 if (__kmp_affinity_verbose) { 4397 if (msg_id != kmp_i18n_null) { 4398 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", 4399 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4400 } else { 4401 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4402 } 4403 } 4404 4405 kmp_safe_raii_file_t f("/proc/cpuinfo", "r"); 4406 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4407 if (depth == 0) { 4408 KMP_EXIT_AFF_NONE; 4409 } 4410 } 4411 4412 #endif /* KMP_OS_LINUX */ 4413 4414 #if KMP_GROUP_AFFINITY 4415 4416 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4417 if (__kmp_affinity_verbose) { 4418 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4419 } 4420 4421 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4422 KMP_ASSERT(depth != 0); 4423 } 4424 4425 #endif /* KMP_GROUP_AFFINITY */ 4426 4427 if (depth < 0) { 4428 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4429 if (file_name == NULL) { 4430 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4431 } else if (line == 0) { 4432 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4433 } else { 4434 KMP_INFORM(UsingFlatOSFileLine, file_name, line, 4435 __kmp_i18n_catgets(msg_id)); 4436 } 4437 } 4438 // FIXME - print msg if msg_id = kmp_i18n_null ??? 
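// Falling back to the flat map means the machine is modeled without any
// cache/core structure: loosely speaking, each allowed OS proc is treated as
// its own package, so later granularity and sorting decisions have only one
// level to work with.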
4439 4440 file_name = ""; 4441 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4442 if (depth == 0) { 4443 KMP_EXIT_AFF_NONE; 4444 } 4445 KMP_ASSERT(depth > 0); 4446 KMP_ASSERT(address2os != NULL); 4447 } 4448 } 4449 4450 #if KMP_USE_HWLOC 4451 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4452 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4453 if (__kmp_affinity_verbose) { 4454 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4455 } 4456 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4457 if (depth == 0) { 4458 KMP_EXIT_AFF_NONE; 4459 } 4460 } 4461 #endif // KMP_USE_HWLOC 4462 4463 // If the user has specified that a particular topology discovery method is to 4464 // be used, then we abort if that method fails. The exception is group 4465 // affinity, which might have been implicitly set. 4466 4467 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4468 4469 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 4470 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 4471 if (__kmp_affinity_verbose) { 4472 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4473 } 4474 4475 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4476 if (depth == 0) { 4477 KMP_EXIT_AFF_NONE; 4478 } 4479 if (depth < 0) { 4480 KMP_ASSERT(msg_id != kmp_i18n_null); 4481 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4482 } 4483 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4484 if (__kmp_affinity_verbose) { 4485 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4486 } 4487 4488 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4489 if (depth == 0) { 4490 KMP_EXIT_AFF_NONE; 4491 } 4492 if (depth < 0) { 4493 KMP_ASSERT(msg_id != kmp_i18n_null); 4494 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4495 } 4496 } 4497 4498 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4499 4500 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4501 const char *filename; 4502 const char *env_var = nullptr; 4503 if (__kmp_cpuinfo_file != NULL) { 4504 filename = __kmp_cpuinfo_file; 4505 env_var = "KMP_CPUINFO_FILE"; 4506 } else { 4507 filename = "/proc/cpuinfo"; 4508 } 4509 4510 if (__kmp_affinity_verbose) { 4511 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4512 } 4513 4514 kmp_safe_raii_file_t f(filename, "r", env_var); 4515 int line = 0; 4516 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4517 if (depth < 0) { 4518 KMP_ASSERT(msg_id != kmp_i18n_null); 4519 if (line > 0) { 4520 KMP_FATAL(FileLineMsgExiting, filename, line, 4521 __kmp_i18n_catgets(msg_id)); 4522 } else { 4523 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4524 } 4525 } 4526 if (__kmp_affinity_type == affinity_none) { 4527 KMP_ASSERT(depth == 0); 4528 KMP_EXIT_AFF_NONE; 4529 } 4530 } 4531 4532 #if KMP_GROUP_AFFINITY 4533 4534 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4535 if (__kmp_affinity_verbose) { 4536 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4537 } 4538 4539 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4540 KMP_ASSERT(depth != 0); 4541 if (depth < 0) { 4542 KMP_ASSERT(msg_id != kmp_i18n_null); 4543 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4544 } 4545 } 4546 4547 #endif /* KMP_GROUP_AFFINITY */ 4548 4549 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4550 if (__kmp_affinity_verbose) { 4551 KMP_INFORM(AffUsingFlatOS, 
"KMP_AFFINITY"); 4552 } 4553 4554 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4555 if (depth == 0) { 4556 KMP_EXIT_AFF_NONE; 4557 } 4558 // should not fail 4559 KMP_ASSERT(depth > 0); 4560 KMP_ASSERT(address2os != NULL); 4561 } 4562 4563 #if KMP_USE_HIER_SCHED 4564 __kmp_dispatch_set_hierarchy_values(); 4565 #endif 4566 4567 if (address2os == NULL) { 4568 if (KMP_AFFINITY_CAPABLE() && 4569 (__kmp_affinity_verbose || 4570 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4571 KMP_WARNING(ErrorInitializeAffinity); 4572 } 4573 __kmp_affinity_type = affinity_none; 4574 __kmp_create_affinity_none_places(); 4575 KMP_AFFINITY_DISABLE(); 4576 return; 4577 } 4578 4579 if (__kmp_affinity_gran == affinity_gran_tile 4580 #if KMP_USE_HWLOC 4581 && __kmp_tile_depth == 0 4582 #endif 4583 ) { 4584 // tiles requested but not detected, warn user on this 4585 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); 4586 } 4587 4588 __kmp_apply_thread_places(&address2os, depth); 4589 4590 // Create the table of masks, indexed by thread Id. 4591 unsigned maxIndex; 4592 unsigned numUnique; 4593 kmp_affin_mask_t *osId2Mask = 4594 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4595 if (__kmp_affinity_gran_levels == 0) { 4596 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4597 } 4598 4599 // Set the childNums vector in all Address objects. This must be done before 4600 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4601 // account the setting of __kmp_affinity_compact. 4602 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4603 4604 switch (__kmp_affinity_type) { 4605 4606 case affinity_explicit: 4607 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4608 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 4609 __kmp_affinity_process_proclist( 4610 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4611 __kmp_affinity_proclist, osId2Mask, maxIndex); 4612 } else { 4613 __kmp_affinity_process_placelist( 4614 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4615 __kmp_affinity_proclist, osId2Mask, maxIndex); 4616 } 4617 if (__kmp_affinity_num_masks == 0) { 4618 if (__kmp_affinity_verbose || 4619 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4620 KMP_WARNING(AffNoValidProcID); 4621 } 4622 __kmp_affinity_type = affinity_none; 4623 __kmp_create_affinity_none_places(); 4624 return; 4625 } 4626 break; 4627 4628 // The other affinity types rely on sorting the Addresses according to some 4629 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4630 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4631 // to do the sort and create the array of affinity masks. 
4632 4633 case affinity_logical: 4634 __kmp_affinity_compact = 0; 4635 if (__kmp_affinity_offset) { 4636 __kmp_affinity_offset = 4637 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4638 } 4639 goto sortAddresses; 4640 4641 case affinity_physical: 4642 if (__kmp_nThreadsPerCore > 1) { 4643 __kmp_affinity_compact = 1; 4644 if (__kmp_affinity_compact >= depth) { 4645 __kmp_affinity_compact = 0; 4646 } 4647 } else { 4648 __kmp_affinity_compact = 0; 4649 } 4650 if (__kmp_affinity_offset) { 4651 __kmp_affinity_offset = 4652 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4653 } 4654 goto sortAddresses; 4655 4656 case affinity_scatter: 4657 if (__kmp_affinity_compact >= depth) { 4658 __kmp_affinity_compact = 0; 4659 } else { 4660 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4661 } 4662 goto sortAddresses; 4663 4664 case affinity_compact: 4665 if (__kmp_affinity_compact >= depth) { 4666 __kmp_affinity_compact = depth - 1; 4667 } 4668 goto sortAddresses; 4669 4670 case affinity_balanced: 4671 if (depth <= 1) { 4672 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4673 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4674 } 4675 __kmp_affinity_type = affinity_none; 4676 __kmp_create_affinity_none_places(); 4677 return; 4678 } else if (!__kmp_affinity_uniform_topology()) { 4679 // Save the depth for further usage 4680 __kmp_aff_depth = depth; 4681 4682 int core_level = __kmp_affinity_find_core_level( 4683 address2os, __kmp_avail_proc, depth - 1); 4684 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4685 depth - 1, core_level); 4686 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4687 address2os, __kmp_avail_proc, depth - 1, core_level); 4688 4689 int nproc = ncores * maxprocpercore; 4690 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4691 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4692 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4693 } 4694 __kmp_affinity_type = affinity_none; 4695 return; 4696 } 4697 4698 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4699 for (int i = 0; i < nproc; i++) { 4700 procarr[i] = -1; 4701 } 4702 4703 int lastcore = -1; 4704 int inlastcore = 0; 4705 for (int i = 0; i < __kmp_avail_proc; i++) { 4706 int proc = address2os[i].second; 4707 int core = 4708 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4709 4710 if (core == lastcore) { 4711 inlastcore++; 4712 } else { 4713 inlastcore = 0; 4714 } 4715 lastcore = core; 4716 4717 procarr[core * maxprocpercore + inlastcore] = proc; 4718 } 4719 } 4720 if (__kmp_affinity_compact >= depth) { 4721 __kmp_affinity_compact = depth - 1; 4722 } 4723 4724 sortAddresses: 4725 // Allocate the gtid->affinity mask table. 4726 if (__kmp_affinity_dups) { 4727 __kmp_affinity_num_masks = __kmp_avail_proc; 4728 } else { 4729 __kmp_affinity_num_masks = numUnique; 4730 } 4731 4732 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4733 (__kmp_affinity_num_places > 0) && 4734 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4735 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4736 } 4737 4738 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4739 4740 // Sort the address2os table according to the current setting of 4741 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
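// Note (informal): when __kmp_affinity_dups is FALSE, only the "leader" entry
// of each granularity group contributes a mask, e.g. granularity=core on a
// 2-way SMT machine would yield one place per core rather than one per
// hardware thread.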
4742 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4743 __kmp_affinity_cmp_Address_child_num); 4744 { 4745 int i; 4746 unsigned j; 4747 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4748 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { 4749 continue; 4750 } 4751 unsigned osId = address2os[i].second; 4752 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4753 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4754 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4755 KMP_CPU_COPY(dest, src); 4756 if (++j >= __kmp_affinity_num_masks) { 4757 break; 4758 } 4759 } 4760 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4761 } 4762 break; 4763 4764 default: 4765 KMP_ASSERT2(0, "Unexpected affinity setting"); 4766 } 4767 4768 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4769 machine_hierarchy.init(address2os, __kmp_avail_proc); 4770 } 4771 #undef KMP_EXIT_AFF_NONE 4772 4773 void __kmp_affinity_initialize(void) { 4774 // Much of the code above was written assuming that if a machine was not 4775 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4776 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4777 // There are too many checks for __kmp_affinity_type == affinity_none 4778 // in this code. Instead of trying to change them all, check if 4779 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4780 // affinity_none, call the real initialization routine, then restore 4781 // __kmp_affinity_type to affinity_disabled. 4782 int disabled = (__kmp_affinity_type == affinity_disabled); 4783 if (!KMP_AFFINITY_CAPABLE()) { 4784 KMP_ASSERT(disabled); 4785 } 4786 if (disabled) { 4787 __kmp_affinity_type = affinity_none; 4788 } 4789 __kmp_aux_affinity_initialize(); 4790 if (disabled) { 4791 __kmp_affinity_type = affinity_disabled; 4792 } 4793 } 4794 4795 void __kmp_affinity_uninitialize(void) { 4796 if (__kmp_affinity_masks != NULL) { 4797 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4798 __kmp_affinity_masks = NULL; 4799 } 4800 if (__kmp_affin_fullMask != NULL) { 4801 KMP_CPU_FREE(__kmp_affin_fullMask); 4802 __kmp_affin_fullMask = NULL; 4803 } 4804 __kmp_affinity_num_masks = 0; 4805 __kmp_affinity_type = affinity_default; 4806 __kmp_affinity_num_places = 0; 4807 if (__kmp_affinity_proclist != NULL) { 4808 __kmp_free(__kmp_affinity_proclist); 4809 __kmp_affinity_proclist = NULL; 4810 } 4811 if (address2os != NULL) { 4812 __kmp_free(address2os); 4813 address2os = NULL; 4814 } 4815 if (procarr != NULL) { 4816 __kmp_free(procarr); 4817 procarr = NULL; 4818 } 4819 #if KMP_USE_HWLOC 4820 if (__kmp_hwloc_topology != NULL) { 4821 hwloc_topology_destroy(__kmp_hwloc_topology); 4822 __kmp_hwloc_topology = NULL; 4823 } 4824 #endif 4825 KMPAffinity::destroy_api(); 4826 } 4827 4828 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4829 if (!KMP_AFFINITY_CAPABLE()) { 4830 return; 4831 } 4832 4833 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4834 if (th->th.th_affin_mask == NULL) { 4835 KMP_CPU_ALLOC(th->th.th_affin_mask); 4836 } else { 4837 KMP_CPU_ZERO(th->th.th_affin_mask); 4838 } 4839 4840 // Copy the thread mask to the kmp_info_t structure. If 4841 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that 4842 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4843 // then the full mask is the same as the mask of the initialization thread. 
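// Worked example with made-up numbers: if __kmp_affinity_num_masks == 8 and
// __kmp_affinity_offset == 2, the modulo assignment below gives gtid 0 ->
// place 2, gtid 5 -> place 7, and gtid 6 wraps around to place 0.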
4844 kmp_affin_mask_t *mask; 4845 int i; 4846 4847 if (KMP_AFFINITY_NON_PROC_BIND) { 4848 if ((__kmp_affinity_type == affinity_none) || 4849 (__kmp_affinity_type == affinity_balanced)) { 4850 #if KMP_GROUP_AFFINITY 4851 if (__kmp_num_proc_groups > 1) { 4852 return; 4853 } 4854 #endif 4855 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4856 i = 0; 4857 mask = __kmp_affin_fullMask; 4858 } else { 4859 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4860 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4861 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4862 } 4863 } else { 4864 if ((!isa_root) || 4865 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4866 #if KMP_GROUP_AFFINITY 4867 if (__kmp_num_proc_groups > 1) { 4868 return; 4869 } 4870 #endif 4871 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4872 i = KMP_PLACE_ALL; 4873 mask = __kmp_affin_fullMask; 4874 } else { 4875 // int i = some hash function or just a counter that doesn't 4876 // always start at 0. Use gtid for now. 4877 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4878 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4879 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4880 } 4881 } 4882 4883 th->th.th_current_place = i; 4884 if (isa_root) { 4885 th->th.th_new_place = i; 4886 th->th.th_first_place = 0; 4887 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4888 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4889 // When using a Non-OMP_PROC_BIND affinity method, 4890 // set all threads' place-partition-var to the entire place list 4891 th->th.th_first_place = 0; 4892 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4893 } 4894 4895 if (i == KMP_PLACE_ALL) { 4896 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4897 gtid)); 4898 } else { 4899 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4900 gtid, i)); 4901 } 4902 4903 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4904 4905 if (__kmp_affinity_verbose 4906 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4907 && (__kmp_affinity_type == affinity_none || 4908 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4909 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4910 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4911 th->th.th_affin_mask); 4912 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4913 __kmp_gettid(), gtid, buf); 4914 } 4915 4916 #if KMP_OS_WINDOWS 4917 // On Windows* OS, the process affinity mask might have changed. If the user 4918 // didn't request affinity and this call fails, just continue silently. 4919 // See CQ171393. 4920 if (__kmp_affinity_type == affinity_none) { 4921 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4922 } else 4923 #endif 4924 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4925 } 4926 4927 void __kmp_affinity_set_place(int gtid) { 4928 if (!KMP_AFFINITY_CAPABLE()) { 4929 return; 4930 } 4931 4932 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4933 4934 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4935 "place = %d)\n", 4936 gtid, th->th.th_new_place, th->th.th_current_place)); 4937 4938 // Check that the new place is within this thread's partition. 
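// Informal note: the place partition is allowed to wrap around the end of
// the place list (e.g. first_place = 6, last_place = 1 on an 8-place machine
// covers places 6,7,0,1), which is why the else branch below uses the relaxed
// wraparound check instead of a simple range test.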
4939 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4940 KMP_ASSERT(th->th.th_new_place >= 0); 4941 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4942 if (th->th.th_first_place <= th->th.th_last_place) { 4943 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 4944 (th->th.th_new_place <= th->th.th_last_place)); 4945 } else { 4946 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 4947 (th->th.th_new_place >= th->th.th_last_place)); 4948 } 4949 4950 // Copy the thread mask to the kmp_info_t structure, 4951 // and set this thread's affinity. 4952 kmp_affin_mask_t *mask = 4953 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); 4954 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4955 th->th.th_current_place = th->th.th_new_place; 4956 4957 if (__kmp_affinity_verbose) { 4958 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4959 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4960 th->th.th_affin_mask); 4961 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4962 __kmp_gettid(), gtid, buf); 4963 } 4964 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4965 } 4966 4967 int __kmp_aux_set_affinity(void **mask) { 4968 int gtid; 4969 kmp_info_t *th; 4970 int retval; 4971 4972 if (!KMP_AFFINITY_CAPABLE()) { 4973 return -1; 4974 } 4975 4976 gtid = __kmp_entry_gtid(); 4977 KA_TRACE( 4978 1000, (""); { 4979 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4980 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4981 (kmp_affin_mask_t *)(*mask)); 4982 __kmp_debug_printf( 4983 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4984 gtid, buf); 4985 }); 4986 4987 if (__kmp_env_consistency_check) { 4988 if ((mask == NULL) || (*mask == NULL)) { 4989 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4990 } else { 4991 unsigned proc; 4992 int num_procs = 0; 4993 4994 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 4995 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4996 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4997 } 4998 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4999 continue; 5000 } 5001 num_procs++; 5002 } 5003 if (num_procs == 0) { 5004 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5005 } 5006 5007 #if KMP_GROUP_AFFINITY 5008 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 5009 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5010 } 5011 #endif /* KMP_GROUP_AFFINITY */ 5012 } 5013 } 5014 5015 th = __kmp_threads[gtid]; 5016 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5017 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5018 if (retval == 0) { 5019 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 5020 } 5021 5022 th->th.th_current_place = KMP_PLACE_UNDEFINED; 5023 th->th.th_new_place = KMP_PLACE_UNDEFINED; 5024 th->th.th_first_place = 0; 5025 th->th.th_last_place = __kmp_affinity_num_masks - 1; 5026 5027 // Turn off 4.0 affinity for the current tread at this parallel level. 
5028 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 5029 5030 return retval; 5031 } 5032 5033 int __kmp_aux_get_affinity(void **mask) { 5034 int gtid; 5035 int retval; 5036 kmp_info_t *th; 5037 5038 if (!KMP_AFFINITY_CAPABLE()) { 5039 return -1; 5040 } 5041 5042 gtid = __kmp_entry_gtid(); 5043 th = __kmp_threads[gtid]; 5044 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5045 5046 KA_TRACE( 5047 1000, (""); { 5048 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5049 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5050 th->th.th_affin_mask); 5051 __kmp_printf( 5052 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 5053 buf); 5054 }); 5055 5056 if (__kmp_env_consistency_check) { 5057 if ((mask == NULL) || (*mask == NULL)) { 5058 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 5059 } 5060 } 5061 5062 #if !KMP_OS_WINDOWS 5063 5064 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5065 KA_TRACE( 5066 1000, (""); { 5067 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5068 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5069 (kmp_affin_mask_t *)(*mask)); 5070 __kmp_printf( 5071 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 5072 buf); 5073 }); 5074 return retval; 5075 5076 #else 5077 (void)retval; 5078 5079 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 5080 return 0; 5081 5082 #endif /* KMP_OS_WINDOWS */ 5083 } 5084 5085 int __kmp_aux_get_affinity_max_proc() { 5086 if (!KMP_AFFINITY_CAPABLE()) { 5087 return 0; 5088 } 5089 #if KMP_GROUP_AFFINITY 5090 if (__kmp_num_proc_groups > 1) { 5091 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 5092 } 5093 #endif 5094 return __kmp_xproc; 5095 } 5096 5097 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 5098 if (!KMP_AFFINITY_CAPABLE()) { 5099 return -1; 5100 } 5101 5102 KA_TRACE( 5103 1000, (""); { 5104 int gtid = __kmp_entry_gtid(); 5105 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5106 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5107 (kmp_affin_mask_t *)(*mask)); 5108 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 5109 "affinity mask for thread %d = %s\n", 5110 proc, gtid, buf); 5111 }); 5112 5113 if (__kmp_env_consistency_check) { 5114 if ((mask == NULL) || (*mask == NULL)) { 5115 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 5116 } 5117 } 5118 5119 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5120 return -1; 5121 } 5122 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5123 return -2; 5124 } 5125 5126 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 5127 return 0; 5128 } 5129 5130 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 5131 if (!KMP_AFFINITY_CAPABLE()) { 5132 return -1; 5133 } 5134 5135 KA_TRACE( 5136 1000, (""); { 5137 int gtid = __kmp_entry_gtid(); 5138 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5139 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5140 (kmp_affin_mask_t *)(*mask)); 5141 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 5142 "affinity mask for thread %d = %s\n", 5143 proc, gtid, buf); 5144 }); 5145 5146 if (__kmp_env_consistency_check) { 5147 if ((mask == NULL) || (*mask == NULL)) { 5148 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 5149 } 5150 } 5151 5152 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5153 return -1; 5154 } 5155 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5156 return -2; 5157 } 5158 5159 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 5160 
  return 0;
}

int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  switch (__kmp_affinity_gran) {
  case affinity_gran_fine:
  case affinity_gran_thread:
    break;
  case affinity_gran_core:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case affinity_gran_package:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_affinity_uniform_topology()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to them - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    // E.g., nthreads = 10 on ncores = 4 gives chunk = 2, big_cores = 2 and
    // big_nth = 6: tids 0-5 go to the two big cores (3 threads each), tids
    // 6-9 to the remaining two cores (2 threads each).
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }

    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = address2os[coreID * __kmp_nth_per_core + i].second;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level = __kmp_affinity_find_core_level(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = address2os[tid].second;
        KMP_CPU_SET(osID, mask);
      } else {
        int core = __kmp_affinity_find_core(address2os, tid,
                                            __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = address2os[i].second;
          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
                                       core_level) == core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep the number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      // Distribute the threads over the available contexts: on the first
      // sweep (flag == 0) only previously unused contexts are claimed, at
      // most one per core at a time; on later sweeps (flag != 0) contexts may
      // be assigned additional threads, until all nthreads are placed.
      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip cores with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      // Walk the per-context thread counts to find the context this tid maps
      // to, then build the mask from that context (fine granularity) or from
      // all contexts of its core.
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// This entry point is not needed on Windows because
// the GetProcessAffinityMask() API is available there.
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error checks the return value
// 4) Uses non-OpenMP parallelization
// 5) Resets the affinity to what was stored in step 1)
// (An illustrative sketch of this sequence appears at the end of this file.)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind the thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
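
// The block below is a minimal, illustrative usage sketch of the sequence
// described above for kmp_set_thread_affinity_mask_initial(). It is not part
// of the runtime and is kept out of the build with #if 0. It assumes a Linux
// caller; the save/restore of the original mask via pthread_getaffinity_np()
// and pthread_setaffinity_np(), and the helper name
// run_external_parallel_region(), are illustrative choices only.
#if 0
#include <pthread.h>
#include <sched.h>

extern "C" int kmp_set_thread_affinity_mask_initial();

static void run_external_parallel_region() {
  cpu_set_t saved;
  CPU_ZERO(&saved);
  // 1) Save the current affinity mask of this thread.
  pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
  // 2) Widen the mask to the full initial mask known to the OpenMP runtime.
  int rc = kmp_set_thread_affinity_mask_initial();
  // 3) Check the return value: 0 on success, -1 if the thread cannot be
  //    bound, >0 (an errno value) if the binding call failed.
  if (rc == 0) {
    // 4) Run the non-OpenMP parallelized work here.
  }
  // 5) Restore the affinity mask saved in step 1.
  pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);
}
#endif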