/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#endif

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels. So they are removed.
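// Illustrative sketch (hypothetical labels, not taken from a real machine):
// with types = {SOCKET, CORE, THREAD} and per-thread labels
//   {0,0,0} {0,1,0} {1,0,0} {1,1,0}
// every core has exactly one thread, so the THREAD level is radix 1 and is
// removed, leaving types = {SOCKET, CORE} and labels {0,0} {0,1} {1,0} {1,1}.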
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, kmp_hw_t *types) {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_DIE] = 85;
  preference[KMP_HW_NUMA] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    KMP_DEBUG_ASSERT(top_index1 >= 0 && top_index1 < depth);
    KMP_DEBUG_ASSERT(top_index2 >= 0 && top_index2 < depth);
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    if (type1 == KMP_HW_SOCKET && type2 == KMP_HW_CORE) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    unsigned id1 = addrP[0].first.labels[top_index1];
    unsigned id2 = addrP[0].first.labels[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < nTh; ++hwidx) {
      if (addrP[hwidx].first.labels[top_index1] == id1 &&
          addrP[hwidx].first.labels[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (addrP[hwidx].first.labels[top_index2] != id2)
        all_same = false;
      id1 = addrP[hwidx].first.labels[top_index1];
      id2 = addrP[hwidx].first.labels[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      for (int idx = 0; idx < nTh; ++idx) {
        Address &hw_thread = addrP[idx].first;
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.labels[d] = hw_thread.labels[d + 1];
        hw_thread.depth--;
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
  return depth;
}
// Gather the count of each topology layer and the ratio
// ratio contains the number of types[i] / types[i+1] and so forth
// count contains the absolute number of types[i]
static void __kmp_affinity_gather_enumeration_information(AddrUnsPair *addrP,
                                                          int nTh, int depth,
                                                          kmp_hw_t *types,
                                                          int *ratio,
                                                          int *count) {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  for (int i = 0; i < nTh; ++i) {
    Address &hw_thread = addrP[i].first;
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.labels[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.labels[layer];
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
static bool __kmp_affinity_discover_uniformity(int depth, int *ratio,
                                               int *count) {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  return (num == count[depth - 1]);
}

// calculate the number of X's per Y
static inline int __kmp_affinity_calculate_ratio(int *ratio, int deep_level,
                                                 int shallow_level) {
  int retval = 1;
  if (deep_level < 0 || shallow_level < 0)
    return retval;
  for (int level = deep_level; level > shallow_level; --level)
    retval *= ratio[level];
  return retval;
}

static void __kmp_affinity_print_topology(AddrUnsPair *addrP, int len,
                                          int depth, kmp_hw_t *types) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    for (int i = 0; i < depth; ++i) {
      __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]),
                          addrP[proc].first.labels[i]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
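// A minimal sketch of the renumbering itself (using the hypothetical labels
// above): the package labels are renumbered per parent node, so 601 -> 0 and
// 602 -> 1 under the first node, and 603 -> 0 and 604 -> 1 under the second;
// the childNums vector then carries these ordinals instead of the raw 60x
// labels, which is what makes the scatter sort interleave nodes.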
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;
static int nDiesPerPkg = 1;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc ==
         (__kmp_nThreadsPerCore * nCoresPerPkg * nDiesPerPkg * nPackages);
}

#if KMP_USE_HWLOC

static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o,
                                               kmp_hwloc_depth_t depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // store sub_id + 1 so that 0 is distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id, nActiveThreads;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  int ratio[KMP_HW_LAST];
  int count[KMP_HW_LAST];
  kmp_hw_t types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotUsingHwloc, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0 && depth <= KMP_HW_LAST);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
  }

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  hw_thread_index = 0;
  pu = NULL;
  nActiveThreads = 0;
  while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    Address hw_thread(depth);
    if (included) {
      hw_thread.labels[index] = pu->logical_index;
      __kmp_pu_os_idx[hw_thread_index] = pu->os_index;
      index--;
      nActiveThreads++;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.labels[index] = memory->logical_index;
            hw_thread.labels[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.labels[index] = obj->logical_index;
          hw_thread.labels[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included) {
      retval[hw_thread_index] = AddrUnsPair(hw_thread, pu->os_index);
      hw_thread_index++;
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 types);

  __kmp_affinity_gather_enumeration_information(retval, nActiveThreads, depth,
                                                types, ratio, count);

  for (int level = 0; level < depth; ++level) {
    if ((types[level] == KMP_HW_L2 || types[level] == KMP_HW_L3))
      __kmp_tile_depth = level;
  }

  // This routine should set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  int thread_level, core_level, tile_level, numa_level, socket_level;
  thread_level = core_level = tile_level = numa_level = socket_level = -1;
  for (int level = 0; level < depth; ++level) {
    if (types[level] == KMP_HW_THREAD)
      thread_level = level;
    else if (types[level] == KMP_HW_CORE)
      core_level = level;
    else if (types[level] == KMP_HW_SOCKET)
      socket_level = level;
    else if (types[level] == KMP_HW_TILE)
      tile_level = level;
    else if (types[level] == KMP_HW_NUMA)
      numa_level = level;
  }
  __kmp_nThreadsPerCore =
      __kmp_affinity_calculate_ratio(ratio, thread_level, core_level);
  nCoresPerPkg =
      __kmp_affinity_calculate_ratio(ratio, core_level, socket_level);
  if (socket_level >= 0)
    nPackages = count[socket_level];
  else
    nPackages = 1;
  if (core_level >= 0)
    __kmp_ncores = count[core_level];
  else
    __kmp_ncores = 1;

  unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    kmp_hw_t numerator_type, denominator_type;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    __kmp_str_buf_clear(&buf);

    if (core_level < 0)
      core_level = depth - 1;
    int ncores = count[core_level];

    denominator_type = KMP_HW_UNKNOWN;
    for (int level = 0; level < depth; ++level) {
      int c;
      bool plural;
      numerator_type = types[level];
      c = ratio[level];
      plural = (c > 1);
      if (level == 0) {
        __kmp_str_buf_print(
            &buf, "%d %s", c,
            __kmp_hw_get_catalog_string(numerator_type, plural));
      } else {
        __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                            __kmp_hw_get_catalog_string(numerator_type, plural),
                            __kmp_hw_get_catalog_string(denominator_type));
      }
      denominator_type = numerator_type;
    }
    KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores);
    __kmp_str_buf_free(&buf);
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Set the granularity level based on what levels are modeled
  // in the machine topology map.
  if (__kmp_affinity_gran == affinity_gran_node)
    __kmp_affinity_gran = affinity_gran_numa;
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread))
      __kmp_affinity_gran_levels++;
    if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core))
      __kmp_affinity_gran_levels++;
    if ((tile_level >= 0) && (__kmp_affinity_gran > affinity_gran_tile))
      __kmp_affinity_gran_levels++;
    if ((numa_level >= 0) && (__kmp_affinity_gran > affinity_gran_numa))
      __kmp_affinity_gran_levels++;
    if ((socket_level >= 0) && (__kmp_affinity_gran > affinity_gran_package))
      __kmp_affinity_gran_levels++;
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_topology(retval, nActiveThreads, depth, types);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

/*
 * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
    Bits            Bits           Bits           Bits
    31-16           15-8           7-4            4-0
---+-----------+--------------+-------------+-----------------+
EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
---+-----------|--------------+-------------+-----------------|
EBX| reserved  | Num logical processors at level (16 bits)    |
---+-----------|--------------+-------------------------------|
ECX| reserved  |   Level Type |     Level Number (8 bits)     |
---+-----------+--------------+-------------------------------|
EDX|                    X2APIC ID (32 bits)                   |
---+----------------------------------------------------------+
*/

enum {
  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
  INTEL_LEVEL_TYPE_SMT = 1,
  INTEL_LEVEL_TYPE_CORE = 2,
  INTEL_LEVEL_TYPE_TILE = 3,
  INTEL_LEVEL_TYPE_MODULE = 4,
  INTEL_LEVEL_TYPE_DIE = 5,
  INTEL_LEVEL_TYPE_LAST = 6,
};

struct cpuid_level_info_t {
  unsigned level_type, mask, mask_width, nitems, cache_mask;
};

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
  switch (intel_type) {
  case INTEL_LEVEL_TYPE_INVALID:
    return KMP_HW_SOCKET;
  case INTEL_LEVEL_TYPE_SMT:
    return KMP_HW_THREAD;
  case INTEL_LEVEL_TYPE_CORE:
    return KMP_HW_CORE;
  // TODO: add support for the tile and module
  case INTEL_LEVEL_TYPE_TILE:
    return KMP_HW_UNKNOWN;
  case INTEL_LEVEL_TYPE_MODULE:
    return KMP_HW_UNKNOWN;
  case INTEL_LEVEL_TYPE_DIE:
    return KMP_HW_DIE;
  }
  return KMP_HW_UNKNOWN;
}

// This function takes the topology leaf, a levels array to store the levels
// detected and a bitmap of the known levels.
// Returns the number of levels in the topology
static unsigned
__kmp_x2apicid_get_levels(int leaf,
                          cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
                          kmp_uint64 known_levels) {
  unsigned level, levels_index;
  unsigned level_type, mask_width, nitems;
  kmp_cpuid buf;

  // The new algorithm has known topology layers act as highest unknown topology
  // layers when unknown topology layers exist.
  // e.g., Suppose layers were SMT CORE <Y> <Z> PACKAGE
  // Then CORE will take the characteristics (nitems and mask width) of <Z>.
  // In developing the id mask for each layer, this eliminates unknown portions
  // of the topology while still keeping the correct underlying structure.
  level = levels_index = 0;
  do {
    __kmp_x86_cpuid(leaf, level, &buf);
    level_type = __kmp_extract_bits<8, 15>(buf.ecx);
    mask_width = __kmp_extract_bits<0, 4>(buf.eax);
    nitems = __kmp_extract_bits<0, 15>(buf.ebx);
    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
      return 0;

    if (known_levels & (1ull << level_type)) {
      // Add a new level to the topology
      KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
      levels[levels_index].level_type = level_type;
      levels[levels_index].mask_width = mask_width;
      levels[levels_index].nitems = nitems;
      levels_index++;
    } else {
      // If it is an unknown level, then logically move the previous layer up
      if (levels_index > 0) {
        levels[levels_index - 1].mask_width = mask_width;
        levels[levels_index - 1].nitems = nitems;
      }
    }
    level++;
  } while (level_type != INTEL_LEVEL_TYPE_INVALID);

  // Set the masks to & with apicid
  for (unsigned i = 0; i < levels_index; ++i) {
    if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
      levels[i].mask = ~((-1) << levels[i].mask_width);
      levels[i].cache_mask = (-1) << levels[i].mask_width;
      for (unsigned j = 0; j < i; ++j)
        levels[i].mask ^= levels[j].mask;
    } else {
      KMP_DEBUG_ASSERT(levels_index > 0);
      levels[i].mask = (-1) << levels[i - 1].mask_width;
      levels[i].cache_mask = 0;
    }
  }
  return levels_index;
}

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
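    //
    // Worked example of the fallback arithmetic below (hypothetical numbers):
    // with __kmp_xproc = 8 and nCoresPerPkg = 4 from cpuid(4), every proc is
    // treated as its own core (HT assumed off), so __kmp_ncores = 8,
    // __kmp_nThreadsPerCore = 1, and nPackages = (8 + 4 - 1) / 4 = 2.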
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
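    //
    // Worked example of the decomposition done below (hypothetical values):
    // cpuid(1) gives maxThreadsPerPkg = 16 (widthCT = 4) and cpuid(4) gives
    // maxCoresPerPkg = 8 (widthC = 3), so widthT = 1. An apicId of 0x1d
    // (0b11101) then splits into pkgId = 0x1d >> 4 = 1,
    // coreId = (0x1d >> 1) & 0x7 = 6, and threadId = 0x1d & 0x1 = 1.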
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
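    // For example (hypothetical map): with the package, core and thread levels
    // all modeled and granularity=core requested, only the thread level sits
    // below the granularity, so the checks below leave
    // __kmp_affinity_gran_levels == 1.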
1681 __kmp_affinity_gran_levels = 0; 1682 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1683 __kmp_affinity_gran_levels++; 1684 } 1685 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1686 __kmp_affinity_gran_levels++; 1687 } 1688 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1689 __kmp_affinity_gran_levels++; 1690 } 1691 } 1692 1693 if (__kmp_affinity_verbose) { 1694 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1695 coreLevel, threadLevel); 1696 } 1697 1698 __kmp_free(threadInfo); 1699 KMP_CPU_FREE(oldMask); 1700 return depth; 1701 } 1702 1703 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1704 // architectures support a newer interface for specifying the x2APIC Ids, 1705 // based on CPUID.B or CPUID.1F 1706 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1707 kmp_i18n_id_t *const msg_id) { 1708 1709 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 1710 int ratio[KMP_HW_LAST]; 1711 int count[KMP_HW_LAST]; 1712 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 1713 unsigned levels_index; 1714 kmp_cpuid buf; 1715 kmp_uint64 known_levels; 1716 int topology_leaf, highest_leaf, apic_id; 1717 int num_leaves; 1718 static int leaves[] = {0, 0}; 1719 1720 kmp_i18n_id_t leaf_message_id; 1721 1722 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 1723 1724 *msg_id = kmp_i18n_null; 1725 1726 // Figure out the known topology levels 1727 known_levels = 0ull; 1728 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 1729 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 1730 known_levels |= (1ull << i); 1731 } 1732 } 1733 1734 // Get the highest cpuid leaf supported 1735 __kmp_x86_cpuid(0, 0, &buf); 1736 highest_leaf = buf.eax; 1737 1738 // If a specific topology method was requested, only allow that specific leaf 1739 // otherwise, try both leaves 31 and 11 in that order 1740 num_leaves = 0; 1741 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 1742 num_leaves = 1; 1743 leaves[0] = 11; 1744 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1745 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 1746 num_leaves = 1; 1747 leaves[0] = 31; 1748 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 1749 } else { 1750 num_leaves = 2; 1751 leaves[0] = 31; 1752 leaves[1] = 11; 1753 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1754 } 1755 1756 // Check to see if cpuid leaf 31 or 11 is supported. 1757 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1758 topology_leaf = -1; 1759 for (int i = 0; i < num_leaves; ++i) { 1760 int leaf = leaves[i]; 1761 if (highest_leaf < leaf) 1762 continue; 1763 __kmp_x86_cpuid(leaf, 0, &buf); 1764 if (buf.ebx == 0) 1765 continue; 1766 topology_leaf = leaf; 1767 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 1768 if (levels_index == 0) 1769 continue; 1770 break; 1771 } 1772 if (topology_leaf == -1 || levels_index == 0) { 1773 *msg_id = leaf_message_id; 1774 return -1; 1775 } 1776 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 1777 1778 // The algorithm used starts by setting the affinity to each available thread 1779 // and retrieving info from the cpuid instruction, so if we are not capable of 1780 // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then 1781 // we need to do something else - use the defaults that we calculated from 1782 // issuing cpuid without binding to each proc. 
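// Added note (illustrative, not from the original source): each entry that
// __kmp_x2apicid_get_levels() fills in carries a mask and mask_width, so a
// thread's per-level labels can later be peeled off its x2APIC id. With
// hypothetical widths of 1 bit (SMT) plus 4 bits (core), apic_id 0x17 would
// decompose as thread = 0x17 & 0x1 = 1 and core = (0x17 & 0x1F) >> 1 = 0xB;
// the remaining high bits identify the package.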
1783 if (!KMP_AFFINITY_CAPABLE()) { 1784 // Hack to try and infer the machine topology using only the data 1785 // available from cpuid on the current thread, and __kmp_xproc. 1786 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1787 1788 for (unsigned i = 0; i < levels_index; ++i) { 1789 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 1790 __kmp_nThreadsPerCore = levels[i].nitems; 1791 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 1792 nCoresPerPkg = levels[i].nitems; 1793 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_DIE) { 1794 nDiesPerPkg = levels[i].nitems; 1795 } 1796 } 1797 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1798 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1799 if (__kmp_affinity_verbose) { 1800 KMP_INFORM(AffNotCapableUseLocCpuidL, "KMP_AFFINITY", topology_leaf); 1801 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1802 if (__kmp_affinity_uniform_topology()) { 1803 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1804 } else { 1805 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1806 } 1807 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1808 __kmp_nThreadsPerCore, __kmp_ncores); 1809 } 1810 return 0; 1811 } 1812 1813 // From here on, we can assume that it is safe to call 1814 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 1815 // __kmp_affinity_type = affinity_none. 1816 1817 // Save the affinity mask for the current thread. 1818 kmp_affin_mask_t *oldMask; 1819 KMP_CPU_ALLOC(oldMask); 1820 __kmp_get_system_affinity(oldMask, TRUE); 1821 1822 // Allocate the data structure to be returned. 1823 int depth = levels_index; 1824 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 1825 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 1826 AddrUnsPair *retval = 1827 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1828 1829 // Run through each of the available contexts, binding the current thread 1830 // to it, and obtaining the pertinent information using the cpuid instr. 1831 unsigned int proc; 1832 int nApics = 0; 1833 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1834 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 1835 unsigned my_levels_index; 1836 1837 // Skip this proc if it is not included in the machine model. 1838 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1839 continue; 1840 } 1841 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1842 1843 __kmp_affinity_dispatch->bind_thread(proc); 1844 1845 // New algorithm 1846 __kmp_x86_cpuid(topology_leaf, 0, &buf); 1847 apic_id = buf.edx; 1848 Address addr(depth); 1849 my_levels_index = 1850 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 1851 if (my_levels_index == 0 || my_levels_index != levels_index) { 1852 KMP_CPU_FREE(oldMask); 1853 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1854 return -1; 1855 } 1856 // Put in topology information 1857 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 1858 addr.labels[idx] = apic_id & my_levels[j].mask; 1859 if (j > 0) 1860 addr.labels[idx] >>= my_levels[j - 1].mask_width; 1861 } 1862 retval[nApics++] = AddrUnsPair(addr, proc); 1863 } 1864 1865 // We've collected all the info we need. 1866 // Restore the old affinity mask for this thread. 1867 __kmp_set_system_affinity(oldMask, TRUE); 1868 1869 // If there's only one thread context to bind to, return now. 
1870 KMP_ASSERT(nApics > 0); 1871 if (nApics == 1) { 1872 int pkg_level; 1873 __kmp_ncores = nPackages = 1; 1874 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1875 if (__kmp_affinity_verbose) { 1876 KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); 1877 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1878 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1879 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1880 __kmp_nThreadsPerCore, __kmp_ncores); 1881 } 1882 1883 if (__kmp_affinity_type == affinity_none) { 1884 __kmp_free(retval); 1885 KMP_CPU_FREE(oldMask); 1886 return 0; 1887 } 1888 1889 pkg_level = 0; 1890 for (int i = 0; i < depth; ++i) 1891 if (types[i] == KMP_HW_SOCKET) { 1892 pkg_level = i; 1893 break; 1894 } 1895 // Form an Address object which only includes the package level. 1896 Address addr(1); 1897 addr.labels[0] = retval[0].first.labels[pkg_level]; 1898 retval[0].first = addr; 1899 1900 if (__kmp_affinity_gran_levels < 0) { 1901 __kmp_affinity_gran_levels = 0; 1902 } 1903 1904 if (__kmp_affinity_verbose) { 1905 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1906 } 1907 1908 *address2os = retval; 1909 KMP_CPU_FREE(oldMask); 1910 return 1; 1911 } 1912 1913 // Sort the table by physical Id. 1914 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1915 1916 __kmp_affinity_gather_enumeration_information(retval, nApics, depth, types, 1917 ratio, count); 1918 1919 // When affinity is off, this routine will still be called to set 1920 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1921 // Make sure all these vars are set correctly, and return if affinity is not 1922 // enabled. 1923 int thread_level, core_level, socket_level, die_level; 1924 thread_level = core_level = die_level = socket_level = -1; 1925 for (int level = 0; level < depth; ++level) { 1926 if (types[level] == KMP_HW_THREAD) 1927 thread_level = level; 1928 else if (types[level] == KMP_HW_CORE) 1929 core_level = level; 1930 else if (types[level] == KMP_HW_DIE) 1931 die_level = level; 1932 else if (types[level] == KMP_HW_SOCKET) 1933 socket_level = level; 1934 } 1935 __kmp_nThreadsPerCore = 1936 __kmp_affinity_calculate_ratio(ratio, thread_level, core_level); 1937 if (die_level > 0) { 1938 nDiesPerPkg = 1939 __kmp_affinity_calculate_ratio(ratio, die_level, socket_level); 1940 nCoresPerPkg = __kmp_affinity_calculate_ratio(ratio, core_level, die_level); 1941 } else { 1942 nCoresPerPkg = 1943 __kmp_affinity_calculate_ratio(ratio, core_level, socket_level); 1944 } 1945 if (socket_level >= 0) 1946 nPackages = count[socket_level]; 1947 else 1948 nPackages = 1; 1949 if (core_level >= 0) 1950 __kmp_ncores = count[core_level]; 1951 else 1952 __kmp_ncores = 1; 1953 1954 // Check to see if the machine topology is uniform 1955 unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count); 1956 1957 // Print the machine topology summary. 
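// Illustrative only: on a hypothetical 2-socket machine with 4 cores per
// socket and 2 threads per core, the summary composed below would read
// "2 Sockets x 4 Cores/Socket x 2 Threads/Core", followed by the core count.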
1958 if (__kmp_affinity_verbose) { 1959 kmp_hw_t numerator_type, denominator_type; 1960 KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf); 1961 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1962 if (uniform) { 1963 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1964 } else { 1965 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1966 } 1967 1968 kmp_str_buf_t buf; 1969 __kmp_str_buf_init(&buf); 1970 1971 if (core_level < 0) 1972 core_level = depth - 1; 1973 int ncores = count[core_level]; 1974 1975 denominator_type = KMP_HW_UNKNOWN; 1976 for (int level = 0; level < depth; ++level) { 1977 int c; 1978 bool plural; 1979 numerator_type = types[level]; 1980 c = ratio[level]; 1981 plural = (c > 1); 1982 if (level == 0) { 1983 __kmp_str_buf_print( 1984 &buf, "%d %s", c, 1985 __kmp_hw_get_catalog_string(numerator_type, plural)); 1986 } else { 1987 __kmp_str_buf_print(&buf, " x %d %s/%s", c, 1988 __kmp_hw_get_catalog_string(numerator_type, plural), 1989 __kmp_hw_get_catalog_string(denominator_type)); 1990 } 1991 denominator_type = numerator_type; 1992 } 1993 KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores); 1994 __kmp_str_buf_free(&buf); 1995 } 1996 1997 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1998 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1999 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2000 for (proc = 0; (int)proc < nApics; ++proc) { 2001 __kmp_pu_os_idx[proc] = retval[proc].second; 2002 } 2003 if (__kmp_affinity_type == affinity_none) { 2004 __kmp_free(retval); 2005 KMP_CPU_FREE(oldMask); 2006 return 0; 2007 } 2008 2009 // Find any levels with radix 1, and remove them from the map 2010 // (except for the package level). 2011 depth = __kmp_affinity_remove_radix_one_levels(retval, nApics, depth, types); 2012 thread_level = core_level = die_level = socket_level = -1; 2013 for (int level = 0; level < depth; ++level) { 2014 if (types[level] == KMP_HW_THREAD) 2015 thread_level = level; 2016 else if (types[level] == KMP_HW_CORE) 2017 core_level = level; 2018 else if (types[level] == KMP_HW_DIE) 2019 die_level = level; 2020 else if (types[level] == KMP_HW_SOCKET) 2021 socket_level = level; 2022 } 2023 2024 if (__kmp_affinity_gran_levels < 0) { 2025 // Set the granularity level based on what levels are modeled 2026 // in the machine topology map. 
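// Clarifying note (added): __kmp_affinity_gran_levels counts how many of the
// innermost topology levels are ignored when affinity masks are formed. For
// example (hypothetical), with socket/core/thread levels and granularity=core,
// only the thread level is dropped, so each mask ends up covering a full core.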
2027 __kmp_affinity_gran_levels = 0;
2028 if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
2029 __kmp_affinity_gran_levels++;
2030 }
2031 if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
2032 __kmp_affinity_gran_levels++;
2033 }
2034 if ((die_level >= 0) && (__kmp_affinity_gran > affinity_gran_die)) {
2035 __kmp_affinity_gran_levels++;
2036 }
2037 if (__kmp_affinity_gran > affinity_gran_package) {
2038 __kmp_affinity_gran_levels++;
2039 }
2040 }
2041
2042 if (__kmp_affinity_verbose) {
2043 __kmp_affinity_print_topology(retval, nApics, depth, types);
2044 }
2045
2046 KMP_CPU_FREE(oldMask);
2047 *address2os = retval;
2048 return depth;
2049 }
2050
2051 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2052
2053 #define osIdIndex 0
2054 #define threadIdIndex 1
2055 #define coreIdIndex 2
2056 #define pkgIdIndex 3
2057 #define nodeIdIndex 4
2058
2059 typedef unsigned *ProcCpuInfo;
2060 static unsigned maxIndex = pkgIdIndex;
2061
2062 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
2063 const void *b) {
2064 unsigned i;
2065 const unsigned *aa = *(unsigned *const *)a;
2066 const unsigned *bb = *(unsigned *const *)b;
2067 for (i = maxIndex;; i--) {
2068 if (aa[i] < bb[i])
2069 return -1;
2070 if (aa[i] > bb[i])
2071 return 1;
2072 if (i == osIdIndex)
2073 break;
2074 }
2075 return 0;
2076 }
2077
2078 #if KMP_USE_HIER_SCHED
2079 // Set the array sizes for the hierarchy layers
2080 static void __kmp_dispatch_set_hierarchy_values() {
2081 // Set the maximum number of L1's to number of cores
2082 // Set the maximum number of L2's to either number of cores / 2 for
2083 // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing,
2084 // or the number of cores for Intel(R) Xeon(R) processors
2085 // Set the maximum number of NUMA nodes and L3's to number of packages
2086 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
2087 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2088 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
2089 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2090 KMP_MIC_SUPPORTED
2091 if (__kmp_mic_type >= mic3)
2092 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
2093 else
2094 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
2095 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
2096 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
2097 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
2098 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
2099 // Set the number of threads per unit
2100 // Number of hardware threads per L1/L2/L3/NUMA/LOOP
2101 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
2102 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
2103 __kmp_nThreadsPerCore;
2104 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
2105 KMP_MIC_SUPPORTED
2106 if (__kmp_mic_type >= mic3)
2107 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2108 2 * __kmp_nThreadsPerCore;
2109 else
2110 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
2111 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2112 __kmp_nThreadsPerCore;
2113 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2114 nCoresPerPkg * __kmp_nThreadsPerCore;
2115 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2116 nCoresPerPkg * __kmp_nThreadsPerCore;
2117 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2118 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2119 }
2120
2121 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2122 // i.e., this thread's L1 or this thread's L2, etc.
2123 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2124 int index = type + 1;
2125 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2126 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2127 if (type == kmp_hier_layer_e::LAYER_THREAD)
2128 return tid;
2129 else if (type == kmp_hier_layer_e::LAYER_LOOP)
2130 return 0;
2131 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2132 if (tid >= num_hw_threads)
2133 tid = tid % num_hw_threads;
2134 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2135 }
2136
2137 // Return the number of t1's per t2
2138 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2139 int i1 = t1 + 1;
2140 int i2 = t2 + 1;
2141 KMP_DEBUG_ASSERT(i1 <= i2);
2142 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2143 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2144 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2145 // (nthreads/t2) / (nthreads/t1) = t1 / t2
2146 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2147 }
2148 #endif // KMP_USE_HIER_SCHED
2149
2150 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2151 // affinity map.
2152 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
2153 int *line,
2154 kmp_i18n_id_t *const msg_id,
2155 FILE *f) {
2156 *address2os = NULL;
2157 *msg_id = kmp_i18n_null;
2158
2159 // Scan the file and count the number of "processor" (osId) fields,
2160 // and find the highest value of <n> for a node_<n> field.
2161 char buf[256];
2162 unsigned num_records = 0;
2163 while (!feof(f)) {
2164 buf[sizeof(buf) - 1] = 1;
2165 if (!fgets(buf, sizeof(buf), f)) {
2166 // Read errors presumably because of EOF
2167 break;
2168 }
2169
2170 char s1[] = "processor";
2171 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2172 num_records++;
2173 continue;
2174 }
2175
2176 // FIXME - this will match "node_<n> <garbage>"
2177 unsigned level;
2178 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2179 // validate the input first:
2180 if (level > (unsigned)__kmp_xproc) { // level is too big
2181 level = __kmp_xproc;
2182 }
2183 if (nodeIdIndex + level >= maxIndex) {
2184 maxIndex = nodeIdIndex + level;
2185 }
2186 continue;
2187 }
2188 }
2189
2190 // Check for empty file / no valid processor records, or too many. The number
2191 // of records can't exceed the number of valid bits in the affinity mask.
2192 if (num_records == 0) {
2193 *line = 0;
2194 *msg_id = kmp_i18n_str_NoProcRecords;
2195 return -1;
2196 }
2197 if (num_records > (unsigned)__kmp_xproc) {
2198 *line = 0;
2199 *msg_id = kmp_i18n_str_TooManyProcRecords;
2200 return -1;
2201 }
2202
2203 // Set the file pointer back to the beginning, so that we can scan the file
2204 // again, this time performing a full parse of the data. Allocate a vector of
2205 // ProcCpuInfo objects, where we will place the data. Adding an extra element
2206 // at the end allows us to remove a lot of extra checks for termination
2207 // conditions.
2208 if (fseek(f, 0, SEEK_SET) != 0) {
2209 *line = 0;
2210 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2211 return -1;
2212 }
2213
2214 // Allocate the array of records to store the proc info in. The dummy
2215 // element at the end makes the logic in filling them out easier to code.
2216 unsigned **threadInfo =
2217 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2218 unsigned i;
2219 for (i = 0; i <= num_records; i++) {
2220 threadInfo[i] =
2221 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2222 }
2223
2224 #define CLEANUP_THREAD_INFO \
2225 for (i = 0; i <= num_records; i++) { \
2226 __kmp_free(threadInfo[i]); \
2227 } \
2228 __kmp_free(threadInfo);
2229
2230 // A value of UINT_MAX means that we didn't find the field
2231 unsigned __index;
2232
2233 #define INIT_PROC_INFO(p) \
2234 for (__index = 0; __index <= maxIndex; __index++) { \
2235 (p)[__index] = UINT_MAX; \
2236 }
2237
2238 for (i = 0; i <= num_records; i++) {
2239 INIT_PROC_INFO(threadInfo[i]);
2240 }
2241
2242 unsigned num_avail = 0;
2243 *line = 0;
2244 while (!feof(f)) {
2245 // Create an inner scoping level, so that all the goto targets at the end of
2246 // the loop appear in an outer scoping level. This avoids warnings about
2247 // jumping past an initialization to a target in the same block.
2248 {
2249 buf[sizeof(buf) - 1] = 1;
2250 bool long_line = false;
2251 if (!fgets(buf, sizeof(buf), f)) {
2252 // Read errors presumably because of EOF
2253 // If there is valid data in threadInfo[num_avail], then fake
2254 // a blank line to ensure that the last address gets parsed.
2255 bool valid = false;
2256 for (i = 0; i <= maxIndex; i++) {
2257 if (threadInfo[num_avail][i] != UINT_MAX) {
2258 valid = true;
2259 }
2260 }
2261 if (!valid) {
2262 break;
2263 }
2264 buf[0] = 0;
2265 } else if (!buf[sizeof(buf) - 1]) {
2266 // The line is longer than the buffer. Set a flag and don't
2267 // emit an error if we were going to ignore the line, anyway.
2268 long_line = true;
2269
2270 #define CHECK_LINE \
2271 if (long_line) { \
2272 CLEANUP_THREAD_INFO; \
2273 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2274 return -1; \
2275 }
2276 }
2277 (*line)++;
2278
2279 char s1[] = "processor";
2280 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2281 CHECK_LINE;
2282 char *p = strchr(buf + sizeof(s1) - 1, ':');
2283 unsigned val;
2284 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2285 goto no_val;
2286 if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2287 #if KMP_ARCH_AARCH64
2288 // Handle the old AArch64 /proc/cpuinfo layout differently: it
2289 // contains all of the 'processor' entries listed in a single
2290 // 'Processor' section, therefore the normal check for duplicates
2291 // in that section would always fail.
2292 num_avail++; 2293 #else 2294 goto dup_field; 2295 #endif 2296 threadInfo[num_avail][osIdIndex] = val; 2297 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2298 char path[256]; 2299 KMP_SNPRINTF( 2300 path, sizeof(path), 2301 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2302 threadInfo[num_avail][osIdIndex]); 2303 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2304 2305 KMP_SNPRINTF(path, sizeof(path), 2306 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2307 threadInfo[num_avail][osIdIndex]); 2308 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2309 continue; 2310 #else 2311 } 2312 char s2[] = "physical id"; 2313 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2314 CHECK_LINE; 2315 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2316 unsigned val; 2317 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2318 goto no_val; 2319 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2320 goto dup_field; 2321 threadInfo[num_avail][pkgIdIndex] = val; 2322 continue; 2323 } 2324 char s3[] = "core id"; 2325 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2326 CHECK_LINE; 2327 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2328 unsigned val; 2329 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2330 goto no_val; 2331 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2332 goto dup_field; 2333 threadInfo[num_avail][coreIdIndex] = val; 2334 continue; 2335 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2336 } 2337 char s4[] = "thread id"; 2338 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2339 CHECK_LINE; 2340 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2341 unsigned val; 2342 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2343 goto no_val; 2344 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2345 goto dup_field; 2346 threadInfo[num_avail][threadIdIndex] = val; 2347 continue; 2348 } 2349 unsigned level; 2350 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2351 CHECK_LINE; 2352 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2353 unsigned val; 2354 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2355 goto no_val; 2356 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2357 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2358 goto dup_field; 2359 threadInfo[num_avail][nodeIdIndex + level] = val; 2360 continue; 2361 } 2362 2363 // We didn't recognize the leading token on the line. There are lots of 2364 // leading tokens that we don't recognize - if the line isn't empty, go on 2365 // to the next line. 2366 if ((*buf != 0) && (*buf != '\n')) { 2367 // If the line is longer than the buffer, read characters 2368 // until we find a newline. 2369 if (long_line) { 2370 int ch; 2371 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2372 ; 2373 } 2374 continue; 2375 } 2376 2377 // A newline has signalled the end of the processor record. 2378 // Check that there aren't too many procs specified. 2379 if ((int)num_avail == __kmp_xproc) { 2380 CLEANUP_THREAD_INFO; 2381 *msg_id = kmp_i18n_str_TooManyEntries; 2382 return -1; 2383 } 2384 2385 // Check for missing fields. The osId field must be there, and we 2386 // currently require that the physical id field is specified, also. 
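// Illustrative /proc/cpuinfo record (values hypothetical); a blank line ends
// the record and triggers the checks below:
//   processor       : 3
//   physical id     : 0
//   core id         : 1
// The "processor" field is mandatory, and a package id must be available,
// either from "physical id" or (on some architectures) from sysfs.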
2387 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2388 CLEANUP_THREAD_INFO; 2389 *msg_id = kmp_i18n_str_MissingProcField; 2390 return -1; 2391 } 2392 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2393 CLEANUP_THREAD_INFO; 2394 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2395 return -1; 2396 } 2397 2398 // Skip this proc if it is not included in the machine model. 2399 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2400 __kmp_affin_fullMask)) { 2401 INIT_PROC_INFO(threadInfo[num_avail]); 2402 continue; 2403 } 2404 2405 // We have a successful parse of this proc's info. 2406 // Increment the counter, and prepare for the next proc. 2407 num_avail++; 2408 KMP_ASSERT(num_avail <= num_records); 2409 INIT_PROC_INFO(threadInfo[num_avail]); 2410 } 2411 continue; 2412 2413 no_val: 2414 CLEANUP_THREAD_INFO; 2415 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2416 return -1; 2417 2418 dup_field: 2419 CLEANUP_THREAD_INFO; 2420 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2421 return -1; 2422 } 2423 *line = 0; 2424 2425 #if KMP_MIC && REDUCE_TEAM_SIZE 2426 unsigned teamSize = 0; 2427 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2428 2429 // check for num_records == __kmp_xproc ??? 2430 2431 // If there's only one thread context to bind to, form an Address object with 2432 // depth 1 and return immediately (or, if affinity is off, set address2os to 2433 // NULL and return). 2434 // 2435 // If it is configured to omit the package level when there is only a single 2436 // package, the logic at the end of this routine won't work if there is only a 2437 // single thread - it would try to form an Address object with depth 0. 2438 KMP_ASSERT(num_avail > 0); 2439 KMP_ASSERT(num_avail <= num_records); 2440 if (num_avail == 1) { 2441 __kmp_ncores = 1; 2442 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2443 if (__kmp_affinity_verbose) { 2444 if (!KMP_AFFINITY_CAPABLE()) { 2445 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2446 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2447 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2448 } else { 2449 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2450 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2451 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2452 } 2453 int index; 2454 kmp_str_buf_t buf; 2455 __kmp_str_buf_init(&buf); 2456 __kmp_str_buf_print(&buf, "1"); 2457 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2458 __kmp_str_buf_print(&buf, " x 1"); 2459 } 2460 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2461 __kmp_str_buf_free(&buf); 2462 } 2463 2464 if (__kmp_affinity_type == affinity_none) { 2465 CLEANUP_THREAD_INFO; 2466 return 0; 2467 } 2468 2469 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2470 Address addr(1); 2471 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2472 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2473 2474 if (__kmp_affinity_gran_levels < 0) { 2475 __kmp_affinity_gran_levels = 0; 2476 } 2477 2478 if (__kmp_affinity_verbose) { 2479 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2480 } 2481 2482 CLEANUP_THREAD_INFO; 2483 return 1; 2484 } 2485 2486 // Sort the threadInfo table by physical Id. 2487 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2488 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2489 2490 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2491 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2492 // the chips on a system. Although coreId's are usually assigned 2493 // [0 .. 
coresPerPkg-1] and threadId's are usually assigned 2494 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2495 // 2496 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2497 // total # packages) are at this point - we want to determine that now. We 2498 // only have an upper bound on the first two figures. 2499 unsigned *counts = 2500 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2501 unsigned *maxCt = 2502 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2503 unsigned *totals = 2504 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2505 unsigned *lastId = 2506 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2507 2508 bool assign_thread_ids = false; 2509 unsigned threadIdCt; 2510 unsigned index; 2511 2512 restart_radix_check: 2513 threadIdCt = 0; 2514 2515 // Initialize the counter arrays with data from threadInfo[0]. 2516 if (assign_thread_ids) { 2517 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2518 threadInfo[0][threadIdIndex] = threadIdCt++; 2519 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2520 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2521 } 2522 } 2523 for (index = 0; index <= maxIndex; index++) { 2524 counts[index] = 1; 2525 maxCt[index] = 1; 2526 totals[index] = 1; 2527 lastId[index] = threadInfo[0][index]; 2528 ; 2529 } 2530 2531 // Run through the rest of the OS procs. 2532 for (i = 1; i < num_avail; i++) { 2533 // Find the most significant index whose id differs from the id for the 2534 // previous OS proc. 2535 for (index = maxIndex; index >= threadIdIndex; index--) { 2536 if (assign_thread_ids && (index == threadIdIndex)) { 2537 // Auto-assign the thread id field if it wasn't specified. 2538 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2539 threadInfo[i][threadIdIndex] = threadIdCt++; 2540 } 2541 // Apparently the thread id field was specified for some entries and not 2542 // others. Start the thread id counter off at the next higher thread id. 2543 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2544 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2545 } 2546 } 2547 if (threadInfo[i][index] != lastId[index]) { 2548 // Run through all indices which are less significant, and reset the 2549 // counts to 1. At all levels up to and including index, we need to 2550 // increment the totals and record the last id. 2551 unsigned index2; 2552 for (index2 = threadIdIndex; index2 < index; index2++) { 2553 totals[index2]++; 2554 if (counts[index2] > maxCt[index2]) { 2555 maxCt[index2] = counts[index2]; 2556 } 2557 counts[index2] = 1; 2558 lastId[index2] = threadInfo[i][index2]; 2559 } 2560 counts[index]++; 2561 totals[index]++; 2562 lastId[index] = threadInfo[i][index]; 2563 2564 if (assign_thread_ids && (index > threadIdIndex)) { 2565 2566 #if KMP_MIC && REDUCE_TEAM_SIZE 2567 // The default team size is the total #threads in the machine 2568 // minus 1 thread for every core that has 3 or more threads. 2569 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2570 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2571 2572 // Restart the thread counter, as we are on a new core. 2573 threadIdCt = 0; 2574 2575 // Auto-assign the thread id field if it wasn't specified. 2576 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2577 threadInfo[i][threadIdIndex] = threadIdCt++; 2578 } 2579 2580 // Apparently the thread id field was specified for some entries and 2581 // not others. Start the thread id counter off at the next higher 2582 // thread id. 
2583 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2584 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2585 }
2586 }
2587 break;
2588 }
2589 }
2590 if (index < threadIdIndex) {
2591 // If thread ids were specified, it is an error if they are not unique.
2592 // Also, check that we haven't already restarted the loop (to be safe -
2593 // shouldn't need to).
2594 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2595 __kmp_free(lastId);
2596 __kmp_free(totals);
2597 __kmp_free(maxCt);
2598 __kmp_free(counts);
2599 CLEANUP_THREAD_INFO;
2600 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2601 return -1;
2602 }
2603
2604 // If the thread ids were not specified and we see entries that are
2605 // duplicates, start the loop over and assign the thread ids manually.
2606 assign_thread_ids = true;
2607 goto restart_radix_check;
2608 }
2609 }
2610
2611 #if KMP_MIC && REDUCE_TEAM_SIZE
2612 // The default team size is the total #threads in the machine
2613 // minus 1 thread for every core that has 3 or more threads.
2614 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2615 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2616
2617 for (index = threadIdIndex; index <= maxIndex; index++) {
2618 if (counts[index] > maxCt[index]) {
2619 maxCt[index] = counts[index];
2620 }
2621 }
2622
2623 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2624 nCoresPerPkg = maxCt[coreIdIndex];
2625 nPackages = totals[pkgIdIndex];
2626
2627 // Check to see if the machine topology is uniform
2628 unsigned prod = totals[maxIndex];
2629 for (index = threadIdIndex; index < maxIndex; index++) {
2630 prod *= maxCt[index];
2631 }
2632 bool uniform = (prod == totals[threadIdIndex]);
2633
2634 // When affinity is off, this routine will still be called to set
2635 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2636 // Make sure all these vars are set correctly, and return now if affinity is
2637 // not enabled.
2638 __kmp_ncores = totals[coreIdIndex];
2639
2640 if (__kmp_affinity_verbose) {
2641 if (!KMP_AFFINITY_CAPABLE()) {
2642 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2643 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2644 if (uniform) {
2645 KMP_INFORM(Uniform, "KMP_AFFINITY");
2646 } else {
2647 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2648 }
2649 } else {
2650 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2651 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2652 if (uniform) {
2653 KMP_INFORM(Uniform, "KMP_AFFINITY");
2654 } else {
2655 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2656 }
2657 }
2658 kmp_str_buf_t buf;
2659 __kmp_str_buf_init(&buf);
2660
2661 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2662 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2663 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2664 }
2665 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2666 maxCt[threadIdIndex], __kmp_ncores);
2667
2668 __kmp_str_buf_free(&buf);
2669 }
2670
2671 #if KMP_MIC && REDUCE_TEAM_SIZE
2672 // Set the default team size.
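// Worked example (hypothetical, only under KMP_MIC && REDUCE_TEAM_SIZE): on a
// 60-core part running 4 threads per core, each core contributes
// threadIdCt - 1 = 3, so teamSize accumulates to 180 and may be used to seed
// __kmp_dflt_team_nth below.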
2673 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2674 __kmp_dflt_team_nth = teamSize; 2675 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2676 "__kmp_dflt_team_nth = %d\n", 2677 __kmp_dflt_team_nth)); 2678 } 2679 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2680 2681 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2682 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2683 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2684 for (i = 0; i < num_avail; ++i) { // fill the os indices 2685 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2686 } 2687 2688 if (__kmp_affinity_type == affinity_none) { 2689 __kmp_free(lastId); 2690 __kmp_free(totals); 2691 __kmp_free(maxCt); 2692 __kmp_free(counts); 2693 CLEANUP_THREAD_INFO; 2694 return 0; 2695 } 2696 2697 // Count the number of levels which have more nodes at that level than at the 2698 // parent's level (with there being an implicit root node of the top level). 2699 // This is equivalent to saying that there is at least one node at this level 2700 // which has a sibling. These levels are in the map, and the package level is 2701 // always in the map. 2702 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2703 for (index = threadIdIndex; index < maxIndex; index++) { 2704 KMP_ASSERT(totals[index] >= totals[index + 1]); 2705 inMap[index] = (totals[index] > totals[index + 1]); 2706 } 2707 inMap[maxIndex] = (totals[maxIndex] > 1); 2708 inMap[pkgIdIndex] = true; 2709 2710 int depth = 0; 2711 for (index = threadIdIndex; index <= maxIndex; index++) { 2712 if (inMap[index]) { 2713 depth++; 2714 } 2715 } 2716 KMP_ASSERT(depth > 0); 2717 2718 // Construct the data structure that is to be returned. 2719 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2720 int pkgLevel = -1; 2721 int coreLevel = -1; 2722 int threadLevel = -1; 2723 2724 for (i = 0; i < num_avail; ++i) { 2725 Address addr(depth); 2726 unsigned os = threadInfo[i][osIdIndex]; 2727 int src_index; 2728 int dst_index = 0; 2729 2730 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2731 if (!inMap[src_index]) { 2732 continue; 2733 } 2734 addr.labels[dst_index] = threadInfo[i][src_index]; 2735 if (src_index == pkgIdIndex) { 2736 pkgLevel = dst_index; 2737 } else if (src_index == coreIdIndex) { 2738 coreLevel = dst_index; 2739 } else if (src_index == threadIdIndex) { 2740 threadLevel = dst_index; 2741 } 2742 dst_index++; 2743 } 2744 (*address2os)[i] = AddrUnsPair(addr, os); 2745 } 2746 2747 if (__kmp_affinity_gran_levels < 0) { 2748 // Set the granularity level based on what levels are modeled 2749 // in the machine topology map. 
2750 unsigned src_index; 2751 __kmp_affinity_gran_levels = 0; 2752 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2753 if (!inMap[src_index]) { 2754 continue; 2755 } 2756 switch (src_index) { 2757 case threadIdIndex: 2758 if (__kmp_affinity_gran > affinity_gran_thread) { 2759 __kmp_affinity_gran_levels++; 2760 } 2761 2762 break; 2763 case coreIdIndex: 2764 if (__kmp_affinity_gran > affinity_gran_core) { 2765 __kmp_affinity_gran_levels++; 2766 } 2767 break; 2768 2769 case pkgIdIndex: 2770 if (__kmp_affinity_gran > affinity_gran_package) { 2771 __kmp_affinity_gran_levels++; 2772 } 2773 break; 2774 } 2775 } 2776 } 2777 2778 if (__kmp_affinity_verbose) { 2779 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2780 coreLevel, threadLevel); 2781 } 2782 2783 __kmp_free(inMap); 2784 __kmp_free(lastId); 2785 __kmp_free(totals); 2786 __kmp_free(maxCt); 2787 __kmp_free(counts); 2788 CLEANUP_THREAD_INFO; 2789 return depth; 2790 } 2791 2792 // Create and return a table of affinity masks, indexed by OS thread ID. 2793 // This routine handles OR'ing together all the affinity masks of threads 2794 // that are sufficiently close, if granularity > fine. 2795 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2796 unsigned *numUnique, 2797 AddrUnsPair *address2os, 2798 unsigned numAddrs) { 2799 // First form a table of affinity masks in order of OS thread id. 2800 unsigned depth; 2801 unsigned maxOsId; 2802 unsigned i; 2803 2804 KMP_ASSERT(numAddrs > 0); 2805 depth = address2os[0].first.depth; 2806 2807 maxOsId = 0; 2808 for (i = numAddrs - 1;; --i) { 2809 unsigned osId = address2os[i].second; 2810 if (osId > maxOsId) { 2811 maxOsId = osId; 2812 } 2813 if (i == 0) 2814 break; 2815 } 2816 kmp_affin_mask_t *osId2Mask; 2817 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2818 2819 // Sort the address2os table according to physical order. Doing so will put 2820 // all threads on the same core/package/node in consecutive locations. 2821 qsort(address2os, numAddrs, sizeof(*address2os), 2822 __kmp_affinity_cmp_Address_labels); 2823 2824 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2825 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2826 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2827 } 2828 if (__kmp_affinity_gran_levels >= (int)depth) { 2829 if (__kmp_affinity_verbose || 2830 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2831 KMP_WARNING(AffThreadsMayMigrate); 2832 } 2833 } 2834 2835 // Run through the table, forming the masks for all threads on each core. 2836 // Threads on the same core will have identical "Address" objects, not 2837 // considering the last level, which must be the thread id. All threads on a 2838 // core will appear consecutively. 2839 unsigned unique = 0; 2840 unsigned j = 0; // index of 1st thread on core 2841 unsigned leader = 0; 2842 Address *leaderAddr = &(address2os[0].first); 2843 kmp_affin_mask_t *sum; 2844 KMP_CPU_ALLOC_ON_STACK(sum); 2845 KMP_CPU_ZERO(sum); 2846 KMP_CPU_SET(address2os[0].second, sum); 2847 for (i = 1; i < numAddrs; i++) { 2848 // If this thread is sufficiently close to the leader (within the 2849 // granularity setting), then set the bit for this os thread in the 2850 // affinity mask for this group, and go on to the next thread. 
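// Added note (assumption): Address::isClose() is taken here to compare all
// labels except the last __kmp_affinity_gran_levels ones, so with
// gran_levels == 1 the hypothetical socket/core/thread labels {0,2,0} and
// {0,2,1} fall into the same per-core mask.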
2851 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2852 KMP_CPU_SET(address2os[i].second, sum); 2853 continue; 2854 } 2855 2856 // For every thread in this group, copy the mask to the thread's entry in 2857 // the osId2Mask table. Mark the first address as a leader. 2858 for (; j < i; j++) { 2859 unsigned osId = address2os[j].second; 2860 KMP_DEBUG_ASSERT(osId <= maxOsId); 2861 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2862 KMP_CPU_COPY(mask, sum); 2863 address2os[j].first.leader = (j == leader); 2864 } 2865 unique++; 2866 2867 // Start a new mask. 2868 leader = i; 2869 leaderAddr = &(address2os[i].first); 2870 KMP_CPU_ZERO(sum); 2871 KMP_CPU_SET(address2os[i].second, sum); 2872 } 2873 2874 // For every thread in last group, copy the mask to the thread's 2875 // entry in the osId2Mask table. 2876 for (; j < i; j++) { 2877 unsigned osId = address2os[j].second; 2878 KMP_DEBUG_ASSERT(osId <= maxOsId); 2879 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2880 KMP_CPU_COPY(mask, sum); 2881 address2os[j].first.leader = (j == leader); 2882 } 2883 unique++; 2884 KMP_CPU_FREE_FROM_STACK(sum); 2885 2886 *maxIndex = maxOsId; 2887 *numUnique = unique; 2888 return osId2Mask; 2889 } 2890 2891 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2892 // as file-static than to try and pass them through the calling sequence of 2893 // the recursive-descent OMP_PLACES parser. 2894 static kmp_affin_mask_t *newMasks; 2895 static int numNewMasks; 2896 static int nextNewMask; 2897 2898 #define ADD_MASK(_mask) \ 2899 { \ 2900 if (nextNewMask >= numNewMasks) { \ 2901 int i; \ 2902 numNewMasks *= 2; \ 2903 kmp_affin_mask_t *temp; \ 2904 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2905 for (i = 0; i < numNewMasks / 2; i++) { \ 2906 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2907 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2908 KMP_CPU_COPY(dest, src); \ 2909 } \ 2910 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2911 newMasks = temp; \ 2912 } \ 2913 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2914 nextNewMask++; \ 2915 } 2916 2917 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2918 { \ 2919 if (((_osId) > _maxOsId) || \ 2920 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2921 if (__kmp_affinity_verbose || \ 2922 (__kmp_affinity_warnings && \ 2923 (__kmp_affinity_type != affinity_none))) { \ 2924 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2925 } \ 2926 } else { \ 2927 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2928 } \ 2929 } 2930 2931 // Re-parse the proclist (for the explicit affinity type), and form the list 2932 // of affinity newMasks indexed by gtid. 2933 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2934 unsigned int *out_numMasks, 2935 const char *proclist, 2936 kmp_affin_mask_t *osId2Mask, 2937 int maxOsId) { 2938 int i; 2939 const char *scan = proclist; 2940 const char *next = proclist; 2941 2942 // We use malloc() for the temporary mask vector, so that we can use 2943 // realloc() to extend it. 
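// For reference (values hypothetical), an explicit proc list has the shape
//   KMP_AFFINITY="explicit,proclist=[0,2-6:2,{8,9}]"
// i.e. single OS proc ids, ranges with an optional stride, and {...} sets
// whose members are OR'ed into a single mask.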
2944 numNewMasks = 2; 2945 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2946 nextNewMask = 0; 2947 kmp_affin_mask_t *sumMask; 2948 KMP_CPU_ALLOC(sumMask); 2949 int setSize = 0; 2950 2951 for (;;) { 2952 int start, end, stride; 2953 2954 SKIP_WS(scan); 2955 next = scan; 2956 if (*next == '\0') { 2957 break; 2958 } 2959 2960 if (*next == '{') { 2961 int num; 2962 setSize = 0; 2963 next++; // skip '{' 2964 SKIP_WS(next); 2965 scan = next; 2966 2967 // Read the first integer in the set. 2968 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2969 SKIP_DIGITS(next); 2970 num = __kmp_str_to_int(scan, *next); 2971 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2972 2973 // Copy the mask for that osId to the sum (union) mask. 2974 if ((num > maxOsId) || 2975 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2976 if (__kmp_affinity_verbose || 2977 (__kmp_affinity_warnings && 2978 (__kmp_affinity_type != affinity_none))) { 2979 KMP_WARNING(AffIgnoreInvalidProcID, num); 2980 } 2981 KMP_CPU_ZERO(sumMask); 2982 } else { 2983 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2984 setSize = 1; 2985 } 2986 2987 for (;;) { 2988 // Check for end of set. 2989 SKIP_WS(next); 2990 if (*next == '}') { 2991 next++; // skip '}' 2992 break; 2993 } 2994 2995 // Skip optional comma. 2996 if (*next == ',') { 2997 next++; 2998 } 2999 SKIP_WS(next); 3000 3001 // Read the next integer in the set. 3002 scan = next; 3003 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3004 3005 SKIP_DIGITS(next); 3006 num = __kmp_str_to_int(scan, *next); 3007 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3008 3009 // Add the mask for that osId to the sum mask. 3010 if ((num > maxOsId) || 3011 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3012 if (__kmp_affinity_verbose || 3013 (__kmp_affinity_warnings && 3014 (__kmp_affinity_type != affinity_none))) { 3015 KMP_WARNING(AffIgnoreInvalidProcID, num); 3016 } 3017 } else { 3018 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3019 setSize++; 3020 } 3021 } 3022 if (setSize > 0) { 3023 ADD_MASK(sumMask); 3024 } 3025 3026 SKIP_WS(next); 3027 if (*next == ',') { 3028 next++; 3029 } 3030 scan = next; 3031 continue; 3032 } 3033 3034 // Read the first integer. 3035 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3036 SKIP_DIGITS(next); 3037 start = __kmp_str_to_int(scan, *next); 3038 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3039 SKIP_WS(next); 3040 3041 // If this isn't a range, then add a mask to the list and go on. 3042 if (*next != '-') { 3043 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3044 3045 // Skip optional comma. 3046 if (*next == ',') { 3047 next++; 3048 } 3049 scan = next; 3050 continue; 3051 } 3052 3053 // This is a range. Skip over the '-' and read in the 2nd int. 3054 next++; // skip '-' 3055 SKIP_WS(next); 3056 scan = next; 3057 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3058 SKIP_DIGITS(next); 3059 end = __kmp_str_to_int(scan, *next); 3060 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3061 3062 // Check for a stride parameter 3063 stride = 1; 3064 SKIP_WS(next); 3065 if (*next == ':') { 3066 // A stride is specified. Skip over the ':" and read the 3rd int. 
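// e.g. (hypothetical) "4-12:4" selects OS procs 4, 8 and 12, while a leading
// '-' on the stride, as in "12-4:-4", walks the range downward.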
3067 int sign = +1; 3068 next++; // skip ':' 3069 SKIP_WS(next); 3070 scan = next; 3071 if (*next == '-') { 3072 sign = -1; 3073 next++; 3074 SKIP_WS(next); 3075 scan = next; 3076 } 3077 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3078 SKIP_DIGITS(next); 3079 stride = __kmp_str_to_int(scan, *next); 3080 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 3081 stride *= sign; 3082 } 3083 3084 // Do some range checks. 3085 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3086 if (stride > 0) { 3087 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3088 } else { 3089 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3090 } 3091 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3092 3093 // Add the mask for each OS proc # to the list. 3094 if (stride > 0) { 3095 do { 3096 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3097 start += stride; 3098 } while (start <= end); 3099 } else { 3100 do { 3101 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3102 start += stride; 3103 } while (start >= end); 3104 } 3105 3106 // Skip optional comma. 3107 SKIP_WS(next); 3108 if (*next == ',') { 3109 next++; 3110 } 3111 scan = next; 3112 } 3113 3114 *out_numMasks = nextNewMask; 3115 if (nextNewMask == 0) { 3116 *out_masks = NULL; 3117 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3118 return; 3119 } 3120 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3121 for (i = 0; i < nextNewMask; i++) { 3122 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3123 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3124 KMP_CPU_COPY(dest, src); 3125 } 3126 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3127 KMP_CPU_FREE(sumMask); 3128 } 3129 3130 /*----------------------------------------------------------------------------- 3131 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3132 places. Again, Here is the grammar: 3133 3134 place_list := place 3135 place_list := place , place_list 3136 place := num 3137 place := place : num 3138 place := place : num : signed 3139 place := { subplacelist } 3140 place := ! 
place // (lowest priority) 3141 subplace_list := subplace 3142 subplace_list := subplace , subplace_list 3143 subplace := num 3144 subplace := num : num 3145 subplace := num : num : signed 3146 signed := num 3147 signed := + signed 3148 signed := - signed 3149 -----------------------------------------------------------------------------*/ 3150 static void __kmp_process_subplace_list(const char **scan, 3151 kmp_affin_mask_t *osId2Mask, 3152 int maxOsId, kmp_affin_mask_t *tempMask, 3153 int *setSize) { 3154 const char *next; 3155 3156 for (;;) { 3157 int start, count, stride, i; 3158 3159 // Read in the starting proc id 3160 SKIP_WS(*scan); 3161 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3162 next = *scan; 3163 SKIP_DIGITS(next); 3164 start = __kmp_str_to_int(*scan, *next); 3165 KMP_ASSERT(start >= 0); 3166 *scan = next; 3167 3168 // valid follow sets are ',' ':' and '}' 3169 SKIP_WS(*scan); 3170 if (**scan == '}' || **scan == ',') { 3171 if ((start > maxOsId) || 3172 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3173 if (__kmp_affinity_verbose || 3174 (__kmp_affinity_warnings && 3175 (__kmp_affinity_type != affinity_none))) { 3176 KMP_WARNING(AffIgnoreInvalidProcID, start); 3177 } 3178 } else { 3179 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3180 (*setSize)++; 3181 } 3182 if (**scan == '}') { 3183 break; 3184 } 3185 (*scan)++; // skip ',' 3186 continue; 3187 } 3188 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3189 (*scan)++; // skip ':' 3190 3191 // Read count parameter 3192 SKIP_WS(*scan); 3193 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3194 next = *scan; 3195 SKIP_DIGITS(next); 3196 count = __kmp_str_to_int(*scan, *next); 3197 KMP_ASSERT(count >= 0); 3198 *scan = next; 3199 3200 // valid follow sets are ',' ':' and '}' 3201 SKIP_WS(*scan); 3202 if (**scan == '}' || **scan == ',') { 3203 for (i = 0; i < count; i++) { 3204 if ((start > maxOsId) || 3205 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3206 if (__kmp_affinity_verbose || 3207 (__kmp_affinity_warnings && 3208 (__kmp_affinity_type != affinity_none))) { 3209 KMP_WARNING(AffIgnoreInvalidProcID, start); 3210 } 3211 break; // don't proliferate warnings for large count 3212 } else { 3213 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3214 start++; 3215 (*setSize)++; 3216 } 3217 } 3218 if (**scan == '}') { 3219 break; 3220 } 3221 (*scan)++; // skip ',' 3222 continue; 3223 } 3224 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3225 (*scan)++; // skip ':' 3226 3227 // Read stride parameter 3228 int sign = +1; 3229 for (;;) { 3230 SKIP_WS(*scan); 3231 if (**scan == '+') { 3232 (*scan)++; // skip '+' 3233 continue; 3234 } 3235 if (**scan == '-') { 3236 sign *= -1; 3237 (*scan)++; // skip '-' 3238 continue; 3239 } 3240 break; 3241 } 3242 SKIP_WS(*scan); 3243 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3244 next = *scan; 3245 SKIP_DIGITS(next); 3246 stride = __kmp_str_to_int(*scan, *next); 3247 KMP_ASSERT(stride >= 0); 3248 *scan = next; 3249 stride *= sign; 3250 3251 // valid follow sets are ',' and '}' 3252 SKIP_WS(*scan); 3253 if (**scan == '}' || **scan == ',') { 3254 for (i = 0; i < count; i++) { 3255 if ((start > maxOsId) || 3256 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3257 if (__kmp_affinity_verbose || 3258 (__kmp_affinity_warnings && 3259 (__kmp_affinity_type != affinity_none))) { 3260 KMP_WARNING(AffIgnoreInvalidProcID, start); 3261 } 3262 
break; // don't proliferate warnings for large count 3263 } else { 3264 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3265 start += stride; 3266 (*setSize)++; 3267 } 3268 } 3269 if (**scan == '}') { 3270 break; 3271 } 3272 (*scan)++; // skip ',' 3273 continue; 3274 } 3275 3276 KMP_ASSERT2(0, "bad explicit places list"); 3277 } 3278 } 3279 3280 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3281 int maxOsId, kmp_affin_mask_t *tempMask, 3282 int *setSize) { 3283 const char *next; 3284 3285 // valid follow sets are '{' '!' and num 3286 SKIP_WS(*scan); 3287 if (**scan == '{') { 3288 (*scan)++; // skip '{' 3289 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3290 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3291 (*scan)++; // skip '}' 3292 } else if (**scan == '!') { 3293 (*scan)++; // skip '!' 3294 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3295 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3296 } else if ((**scan >= '0') && (**scan <= '9')) { 3297 next = *scan; 3298 SKIP_DIGITS(next); 3299 int num = __kmp_str_to_int(*scan, *next); 3300 KMP_ASSERT(num >= 0); 3301 if ((num > maxOsId) || 3302 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3303 if (__kmp_affinity_verbose || 3304 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3305 KMP_WARNING(AffIgnoreInvalidProcID, num); 3306 } 3307 } else { 3308 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3309 (*setSize)++; 3310 } 3311 *scan = next; // skip num 3312 } else { 3313 KMP_ASSERT2(0, "bad explicit places list"); 3314 } 3315 } 3316 3317 // static void 3318 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3319 unsigned int *out_numMasks, 3320 const char *placelist, 3321 kmp_affin_mask_t *osId2Mask, 3322 int maxOsId) { 3323 int i, j, count, stride, sign; 3324 const char *scan = placelist; 3325 const char *next = placelist; 3326 3327 numNewMasks = 2; 3328 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3329 nextNewMask = 0; 3330 3331 // tempMask is modified based on the previous or initial 3332 // place to form the current place 3333 // previousMask contains the previous place 3334 kmp_affin_mask_t *tempMask; 3335 kmp_affin_mask_t *previousMask; 3336 KMP_CPU_ALLOC(tempMask); 3337 KMP_CPU_ZERO(tempMask); 3338 KMP_CPU_ALLOC(previousMask); 3339 KMP_CPU_ZERO(previousMask); 3340 int setSize = 0; 3341 3342 for (;;) { 3343 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3344 3345 // valid follow sets are ',' ':' and EOL 3346 SKIP_WS(scan); 3347 if (*scan == '\0' || *scan == ',') { 3348 if (setSize > 0) { 3349 ADD_MASK(tempMask); 3350 } 3351 KMP_CPU_ZERO(tempMask); 3352 setSize = 0; 3353 if (*scan == '\0') { 3354 break; 3355 } 3356 scan++; // skip ',' 3357 continue; 3358 } 3359 3360 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3361 scan++; // skip ':' 3362 3363 // Read count parameter 3364 SKIP_WS(scan); 3365 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3366 next = scan; 3367 SKIP_DIGITS(next); 3368 count = __kmp_str_to_int(scan, *next); 3369 KMP_ASSERT(count >= 0); 3370 scan = next; 3371 3372 // valid follow sets are ',' ':' and EOL 3373 SKIP_WS(scan); 3374 if (*scan == '\0' || *scan == ',') { 3375 stride = +1; 3376 } else { 3377 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3378 scan++; // skip ':' 3379 3380 // Read stride parameter 3381 sign = +1; 3382 for (;;) { 3383 SKIP_WS(scan); 3384 if (*scan == '+') { 3385 scan++; // skip 
'+' 3386 continue; 3387 } 3388 if (*scan == '-') { 3389 sign *= -1; 3390 scan++; // skip '-' 3391 continue; 3392 } 3393 break; 3394 } 3395 SKIP_WS(scan); 3396 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3397 next = scan; 3398 SKIP_DIGITS(next); 3399 stride = __kmp_str_to_int(scan, *next); 3400 KMP_DEBUG_ASSERT(stride >= 0); 3401 scan = next; 3402 stride *= sign; 3403 } 3404 3405 // Add places determined by initial_place : count : stride 3406 for (i = 0; i < count; i++) { 3407 if (setSize == 0) { 3408 break; 3409 } 3410 // Add the current place, then build the next place (tempMask) from that 3411 KMP_CPU_COPY(previousMask, tempMask); 3412 ADD_MASK(previousMask); 3413 KMP_CPU_ZERO(tempMask); 3414 setSize = 0; 3415 KMP_CPU_SET_ITERATE(j, previousMask) { 3416 if (!KMP_CPU_ISSET(j, previousMask)) { 3417 continue; 3418 } 3419 if ((j + stride > maxOsId) || (j + stride < 0) || 3420 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3421 (!KMP_CPU_ISSET(j + stride, 3422 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3423 if ((__kmp_affinity_verbose || 3424 (__kmp_affinity_warnings && 3425 (__kmp_affinity_type != affinity_none))) && 3426 i < count - 1) { 3427 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3428 } 3429 continue; 3430 } 3431 KMP_CPU_SET(j + stride, tempMask); 3432 setSize++; 3433 } 3434 } 3435 KMP_CPU_ZERO(tempMask); 3436 setSize = 0; 3437 3438 // valid follow sets are ',' and EOL 3439 SKIP_WS(scan); 3440 if (*scan == '\0') { 3441 break; 3442 } 3443 if (*scan == ',') { 3444 scan++; // skip ',' 3445 continue; 3446 } 3447 3448 KMP_ASSERT2(0, "bad explicit places list"); 3449 } 3450 3451 *out_numMasks = nextNewMask; 3452 if (nextNewMask == 0) { 3453 *out_masks = NULL; 3454 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3455 return; 3456 } 3457 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3458 KMP_CPU_FREE(tempMask); 3459 KMP_CPU_FREE(previousMask); 3460 for (i = 0; i < nextNewMask; i++) { 3461 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3462 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3463 KMP_CPU_COPY(dest, src); 3464 } 3465 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3466 } 3467 3468 #undef ADD_MASK 3469 #undef ADD_MASK_OSID 3470 3471 #if KMP_USE_HWLOC 3472 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3473 // skip PUs descendants of the object o 3474 int skipped = 0; 3475 hwloc_obj_t hT = NULL; 3476 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3477 for (int i = 0; i < N; ++i) { 3478 KMP_DEBUG_ASSERT(hT); 3479 unsigned idx = hT->os_index; 3480 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3481 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3482 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3483 ++skipped; 3484 } 3485 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3486 } 3487 return skipped; // count number of skipped units 3488 } 3489 3490 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3491 // check if obj has PUs present in fullMask 3492 hwloc_obj_t hT = NULL; 3493 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3494 for (int i = 0; i < N; ++i) { 3495 KMP_DEBUG_ASSERT(hT); 3496 unsigned idx = hT->os_index; 3497 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) 3498 return 1; // found PU 3499 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3500 } 3501 return 0; // no PUs found 3502 } 3503 #endif // KMP_USE_HWLOC 3504 3505 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3506 AddrUnsPair 
*newAddr; 3507 if (__kmp_hws_requested == 0) 3508 goto _exit; // no topology limiting actions requested, exit 3509 #if KMP_USE_HWLOC 3510 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3511 // Number of subobjects calculated dynamically, this works fine for 3512 // any non-uniform topology. 3513 // L2 cache objects are determined by depth, other objects - by type. 3514 hwloc_topology_t tp = __kmp_hwloc_topology; 3515 int nS = 0, nN = 0, nL = 0, nC = 0, 3516 nT = 0; // logical index including skipped 3517 int nCr = 0, nTr = 0; // number of requested units 3518 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters 3519 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) 3520 int L2depth, idx; 3521 3522 // check support of extensions ---------------------------------- 3523 int numa_support = 0, tile_support = 0; 3524 if (__kmp_pu_os_idx) 3525 hT = hwloc_get_pu_obj_by_os_index(tp, 3526 __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3527 else 3528 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3529 if (hT == NULL) { // something's gone wrong 3530 KMP_WARNING(AffHWSubsetUnsupported); 3531 goto _exit; 3532 } 3533 // check NUMA node 3534 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3535 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3536 if (hN != NULL && hN->depth > hS->depth) { 3537 numa_support = 1; // 1 in case socket includes node(s) 3538 } else if (__kmp_hws_node.num > 0) { 3539 // don't support sockets inside NUMA node (no such HW found for testing) 3540 KMP_WARNING(AffHWSubsetUnsupported); 3541 goto _exit; 3542 } 3543 // check L2 cahce, get object by depth because of multiple caches 3544 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3545 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3546 if (hL != NULL && 3547 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { 3548 tile_support = 1; // no sense to count L2 if it includes single core 3549 } else if (__kmp_hws_tile.num > 0) { 3550 if (__kmp_hws_core.num == 0) { 3551 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3552 __kmp_hws_tile.num = 0; 3553 } else { 3554 // L2 and core are both requested, but represent same object 3555 KMP_WARNING(AffHWSubsetInvalid); 3556 goto _exit; 3557 } 3558 } 3559 // end of check of extensions ----------------------------------- 3560 3561 // fill in unset items, validate settings ----------------------- 3562 if (__kmp_hws_socket.num == 0) 3563 __kmp_hws_socket.num = nPackages; // use all available sockets 3564 if (__kmp_hws_socket.offset >= nPackages) { 3565 KMP_WARNING(AffHWSubsetManySockets); 3566 goto _exit; 3567 } 3568 if (numa_support) { 3569 hN = NULL; 3570 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3571 &hN); // num nodes in socket 3572 if (__kmp_hws_node.num == 0) 3573 __kmp_hws_node.num = NN; // use all available nodes 3574 if (__kmp_hws_node.offset >= NN) { 3575 KMP_WARNING(AffHWSubsetManyNodes); 3576 goto _exit; 3577 } 3578 if (tile_support) { 3579 // get num tiles in node 3580 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3581 if (__kmp_hws_tile.num == 0) { 3582 __kmp_hws_tile.num = NL + 1; 3583 } // use all available tiles, some node may have more tiles, thus +1 3584 if (__kmp_hws_tile.offset >= NL) { 3585 KMP_WARNING(AffHWSubsetManyTiles); 3586 goto _exit; 3587 } 3588 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3589 &hC); // num cores in tile 3590 if 
(__kmp_hws_core.num == 0) 3591 __kmp_hws_core.num = NC; // use all available cores 3592 if (__kmp_hws_core.offset >= NC) { 3593 KMP_WARNING(AffHWSubsetManyCores); 3594 goto _exit; 3595 } 3596 } else { // tile_support 3597 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3598 &hC); // num cores in node 3599 if (__kmp_hws_core.num == 0) 3600 __kmp_hws_core.num = NC; // use all available cores 3601 if (__kmp_hws_core.offset >= NC) { 3602 KMP_WARNING(AffHWSubsetManyCores); 3603 goto _exit; 3604 } 3605 } // tile_support 3606 } else { // numa_support 3607 if (tile_support) { 3608 // get num tiles in socket 3609 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3610 if (__kmp_hws_tile.num == 0) 3611 __kmp_hws_tile.num = NL; // use all available tiles 3612 if (__kmp_hws_tile.offset >= NL) { 3613 KMP_WARNING(AffHWSubsetManyTiles); 3614 goto _exit; 3615 } 3616 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3617 &hC); // num cores in tile 3618 if (__kmp_hws_core.num == 0) 3619 __kmp_hws_core.num = NC; // use all available cores 3620 if (__kmp_hws_core.offset >= NC) { 3621 KMP_WARNING(AffHWSubsetManyCores); 3622 goto _exit; 3623 } 3624 } else { // tile_support 3625 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3626 &hC); // num cores in socket 3627 if (__kmp_hws_core.num == 0) 3628 __kmp_hws_core.num = NC; // use all available cores 3629 if (__kmp_hws_core.offset >= NC) { 3630 KMP_WARNING(AffHWSubsetManyCores); 3631 goto _exit; 3632 } 3633 } // tile_support 3634 } 3635 if (__kmp_hws_proc.num == 0) 3636 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3637 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3638 KMP_WARNING(AffHWSubsetManyProcs); 3639 goto _exit; 3640 } 3641 // end of validation -------------------------------------------- 3642 3643 if (pAddr) // pAddr is NULL in case of affinity_none 3644 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3645 __kmp_avail_proc); // max size 3646 // main loop to form HW subset ---------------------------------- 3647 hS = NULL; 3648 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3649 for (int s = 0; s < NP; ++s) { 3650 // Check Socket ----------------------------------------------- 3651 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3652 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3653 continue; // skip socket if all PUs are out of fullMask 3654 ++nS; // only count objects those have PUs in affinity mask 3655 if (nS <= __kmp_hws_socket.offset || 3656 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3657 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3658 continue; // move to next socket 3659 } 3660 nCr = 0; // count number of cores per socket 3661 // socket requested, go down the topology tree 3662 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3663 if (numa_support) { 3664 nN = 0; 3665 hN = NULL; 3666 // num nodes in current socket 3667 int NN = 3668 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3669 for (int n = 0; n < NN; ++n) { 3670 // Check NUMA Node ---------------------------------------- 3671 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3672 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3673 continue; // skip node if all PUs are out of fullMask 3674 } 3675 ++nN; 3676 if (nN <= __kmp_hws_node.offset || 3677 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3678 // skip node as not requested 3679 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // 
skip node 3680 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3681 continue; // move to next node 3682 } 3683 // node requested, go down the topology tree 3684 if (tile_support) { 3685 nL = 0; 3686 hL = NULL; 3687 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3688 for (int l = 0; l < NL; ++l) { 3689 // Check L2 (tile) ------------------------------------ 3690 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3691 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3692 continue; // skip tile if all PUs are out of fullMask 3693 } 3694 ++nL; 3695 if (nL <= __kmp_hws_tile.offset || 3696 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3697 // skip tile as not requested 3698 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3699 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3700 continue; // move to next tile 3701 } 3702 // tile requested, go down the topology tree 3703 nC = 0; 3704 hC = NULL; 3705 // num cores in current tile 3706 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3707 HWLOC_OBJ_CORE, &hC); 3708 for (int c = 0; c < NC; ++c) { 3709 // Check Core --------------------------------------- 3710 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3711 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3712 continue; // skip core if all PUs are out of fullMask 3713 } 3714 ++nC; 3715 if (nC <= __kmp_hws_core.offset || 3716 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3717 // skip node as not requested 3718 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3719 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3720 continue; // move to next node 3721 } 3722 // core requested, go down to PUs 3723 nT = 0; 3724 nTr = 0; 3725 hT = NULL; 3726 // num procs in current core 3727 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3728 HWLOC_OBJ_PU, &hT); 3729 for (int t = 0; t < NT; ++t) { 3730 // Check PU --------------------------------------- 3731 idx = hT->os_index; 3732 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3733 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3734 continue; // skip PU if not in fullMask 3735 } 3736 ++nT; 3737 if (nT <= __kmp_hws_proc.offset || 3738 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3739 // skip PU 3740 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3741 ++n_old; 3742 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3743 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3744 continue; // move to next node 3745 } 3746 ++nTr; 3747 if (pAddr) // collect requested thread's data 3748 newAddr[n_new] = (*pAddr)[n_old]; 3749 ++n_new; 3750 ++n_old; 3751 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3752 } // threads loop 3753 if (nTr > 0) { 3754 ++nCr; // num cores per socket 3755 ++nCo; // total num cores 3756 if (nTr > nTpC) 3757 nTpC = nTr; // calc max threads per core 3758 } 3759 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3760 } // cores loop 3761 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3762 } // tiles loop 3763 } else { // tile_support 3764 // no tiles, check cores 3765 nC = 0; 3766 hC = NULL; 3767 // num cores in current node 3768 int NC = 3769 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3770 for (int c = 0; c < NC; ++c) { 3771 // Check Core --------------------------------------- 3772 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3773 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3774 continue; // skip core if all PUs are out of fullMask 3775 } 3776 ++nC; 3777 if (nC <= __kmp_hws_core.offset || 3778 nC > __kmp_hws_core.num 
+ __kmp_hws_core.offset) { 3779 // skip node as not requested 3780 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3781 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3782 continue; // move to next node 3783 } 3784 // core requested, go down to PUs 3785 nT = 0; 3786 nTr = 0; 3787 hT = NULL; 3788 int NT = 3789 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3790 for (int t = 0; t < NT; ++t) { 3791 // Check PU --------------------------------------- 3792 idx = hT->os_index; 3793 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3794 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3795 continue; // skip PU if not in fullMask 3796 } 3797 ++nT; 3798 if (nT <= __kmp_hws_proc.offset || 3799 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3800 // skip PU 3801 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3802 ++n_old; 3803 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3804 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3805 continue; // move to next node 3806 } 3807 ++nTr; 3808 if (pAddr) // collect requested thread's data 3809 newAddr[n_new] = (*pAddr)[n_old]; 3810 ++n_new; 3811 ++n_old; 3812 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3813 } // threads loop 3814 if (nTr > 0) { 3815 ++nCr; // num cores per socket 3816 ++nCo; // total num cores 3817 if (nTr > nTpC) 3818 nTpC = nTr; // calc max threads per core 3819 } 3820 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3821 } // cores loop 3822 } // tiles support 3823 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3824 } // nodes loop 3825 } else { // numa_support 3826 // no NUMA support 3827 if (tile_support) { 3828 nL = 0; 3829 hL = NULL; 3830 // num tiles in current socket 3831 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3832 for (int l = 0; l < NL; ++l) { 3833 // Check L2 (tile) ------------------------------------ 3834 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3835 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3836 continue; // skip tile if all PUs are out of fullMask 3837 } 3838 ++nL; 3839 if (nL <= __kmp_hws_tile.offset || 3840 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3841 // skip tile as not requested 3842 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3843 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3844 continue; // move to next tile 3845 } 3846 // tile requested, go down the topology tree 3847 nC = 0; 3848 hC = NULL; 3849 // num cores per tile 3850 int NC = 3851 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); 3852 for (int c = 0; c < NC; ++c) { 3853 // Check Core --------------------------------------- 3854 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3855 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3856 continue; // skip core if all PUs are out of fullMask 3857 } 3858 ++nC; 3859 if (nC <= __kmp_hws_core.offset || 3860 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3861 // skip node as not requested 3862 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3863 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3864 continue; // move to next node 3865 } 3866 // core requested, go down to PUs 3867 nT = 0; 3868 nTr = 0; 3869 hT = NULL; 3870 // num procs per core 3871 int NT = 3872 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3873 for (int t = 0; t < NT; ++t) { 3874 // Check PU --------------------------------------- 3875 idx = hT->os_index; 3876 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3877 hT = hwloc_get_next_obj_by_type(tp, 
HWLOC_OBJ_PU, hT); 3878 continue; // skip PU if not in fullMask 3879 } 3880 ++nT; 3881 if (nT <= __kmp_hws_proc.offset || 3882 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3883 // skip PU 3884 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3885 ++n_old; 3886 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3887 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3888 continue; // move to next node 3889 } 3890 ++nTr; 3891 if (pAddr) // collect requested thread's data 3892 newAddr[n_new] = (*pAddr)[n_old]; 3893 ++n_new; 3894 ++n_old; 3895 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3896 } // threads loop 3897 if (nTr > 0) { 3898 ++nCr; // num cores per socket 3899 ++nCo; // total num cores 3900 if (nTr > nTpC) 3901 nTpC = nTr; // calc max threads per core 3902 } 3903 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3904 } // cores loop 3905 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3906 } // tiles loop 3907 } else { // tile_support 3908 // no tiles, check cores 3909 nC = 0; 3910 hC = NULL; 3911 // num cores in socket 3912 int NC = 3913 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3914 for (int c = 0; c < NC; ++c) { 3915 // Check Core ------------------------------------------- 3916 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3917 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3918 continue; // skip core if all PUs are out of fullMask 3919 } 3920 ++nC; 3921 if (nC <= __kmp_hws_core.offset || 3922 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3923 // skip node as not requested 3924 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3925 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3926 continue; // move to next node 3927 } 3928 // core requested, go down to PUs 3929 nT = 0; 3930 nTr = 0; 3931 hT = NULL; 3932 // num procs per core 3933 int NT = 3934 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3935 for (int t = 0; t < NT; ++t) { 3936 // Check PU --------------------------------------- 3937 idx = hT->os_index; 3938 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3939 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3940 continue; // skip PU if not in fullMask 3941 } 3942 ++nT; 3943 if (nT <= __kmp_hws_proc.offset || 3944 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3945 // skip PU 3946 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3947 ++n_old; 3948 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3949 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3950 continue; // move to next node 3951 } 3952 ++nTr; 3953 if (pAddr) // collect requested thread's data 3954 newAddr[n_new] = (*pAddr)[n_old]; 3955 ++n_new; 3956 ++n_old; 3957 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3958 } // threads loop 3959 if (nTr > 0) { 3960 ++nCr; // num cores per socket 3961 ++nCo; // total num cores 3962 if (nTr > nTpC) 3963 nTpC = nTr; // calc max threads per core 3964 } 3965 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3966 } // cores loop 3967 } // tiles support 3968 } // numa_support 3969 if (nCr > 0) { // found cores? 
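// (nCr counts the cores of this socket that still have at least one PU in
// the full mask after the KMP_HW_SUBSET filtering above; the lines below
// fold it into the corrected totals nPkg and nCpP.)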
3970 ++nPkg; // num sockets 3971 if (nCr > nCpP) 3972 nCpP = nCr; // calc max cores per socket 3973 } 3974 } // sockets loop 3975 3976 // check the subset is valid 3977 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3978 KMP_DEBUG_ASSERT(nPkg > 0); 3979 KMP_DEBUG_ASSERT(nCpP > 0); 3980 KMP_DEBUG_ASSERT(nTpC > 0); 3981 KMP_DEBUG_ASSERT(nCo > 0); 3982 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3983 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3984 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3985 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3986 3987 nPackages = nPkg; // correct num sockets 3988 nCoresPerPkg = nCpP; // correct num cores per socket 3989 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3990 __kmp_avail_proc = n_new; // correct num procs 3991 __kmp_ncores = nCo; // correct num cores 3992 // hwloc topology method end 3993 } else 3994 #endif // KMP_USE_HWLOC 3995 { 3996 int n_old = 0, n_new = 0, proc_num = 0; 3997 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3998 KMP_WARNING(AffHWSubsetNoHWLOC); 3999 goto _exit; 4000 } 4001 if (__kmp_hws_socket.num == 0) 4002 __kmp_hws_socket.num = nPackages; // use all available sockets 4003 if (__kmp_hws_die.num == 0) 4004 __kmp_hws_die.num = nDiesPerPkg; // use all available dies 4005 if (__kmp_hws_core.num == 0) 4006 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 4007 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 4008 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 4009 if (!__kmp_affinity_uniform_topology()) { 4010 KMP_WARNING(AffHWSubsetNonUniform); 4011 goto _exit; // don't support non-uniform topology 4012 } 4013 if (depth > 4) { 4014 KMP_WARNING(AffHWSubsetNonThreeLevel); 4015 goto _exit; // don't support not-3-level topology 4016 } 4017 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 4018 KMP_WARNING(AffHWSubsetManySockets); 4019 goto _exit; 4020 } 4021 if (depth == 4 && __kmp_hws_die.offset + __kmp_hws_die.num > nDiesPerPkg) { 4022 KMP_WARNING(AffHWSubsetManyDies); 4023 goto _exit; 4024 } 4025 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 4026 KMP_WARNING(AffHWSubsetManyCores); 4027 goto _exit; 4028 } 4029 // Form the requested subset 4030 if (pAddr) // pAddr is NULL in case of affinity_none 4031 newAddr = (AddrUnsPair *)__kmp_allocate( 4032 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_die.num * 4033 __kmp_hws_core.num * __kmp_hws_proc.num); 4034 for (int i = 0; i < nPackages; ++i) { 4035 if (i < __kmp_hws_socket.offset || 4036 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 4037 // skip not-requested socket 4038 n_old += nDiesPerPkg * nCoresPerPkg * __kmp_nThreadsPerCore; 4039 if (__kmp_pu_os_idx != NULL) { 4040 // walk through skipped socket 4041 for (int l = 0; l < nDiesPerPkg; ++l) { 4042 for (int j = 0; j < nCoresPerPkg; ++j) { 4043 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4044 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 4045 ++proc_num; 4046 } 4047 } 4048 } 4049 } 4050 } else { 4051 // walk through requested socket 4052 for (int l = 0; l < nDiesPerPkg; ++l) { 4053 // skip unwanted die 4054 if (l < __kmp_hws_die.offset || 4055 l >= __kmp_hws_die.offset + __kmp_hws_die.num) { 4056 n_old += nCoresPerPkg; 4057 if (__kmp_pu_os_idx != NULL) { 4058 for (int k = 0; k < nCoresPerPkg; ++k) { 4059 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 4060 ++proc_num; 4061 } 4062 } 4063 } else { 4064 for (int j = 0; j < nCoresPerPkg; ++j) { 4065 if (j < __kmp_hws_core.offset || 4066 
j >= __kmp_hws_core.offset + 4067 __kmp_hws_core.num) { // skip not-requested core 4068 n_old += __kmp_nThreadsPerCore; 4069 if (__kmp_pu_os_idx != NULL) { 4070 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4071 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], 4072 __kmp_affin_fullMask); 4073 ++proc_num; 4074 } 4075 } 4076 } else { 4077 // walk through requested core 4078 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4079 if (k < __kmp_hws_proc.num) { 4080 if (pAddr) // collect requested thread's data 4081 newAddr[n_new] = (*pAddr)[n_old]; 4082 n_new++; 4083 } else { 4084 if (__kmp_pu_os_idx != NULL) 4085 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], 4086 __kmp_affin_fullMask); 4087 } 4088 n_old++; 4089 ++proc_num; 4090 } 4091 } 4092 } 4093 } 4094 } 4095 } 4096 } 4097 KMP_DEBUG_ASSERT(n_old == nPackages * nDiesPerPkg * nCoresPerPkg * 4098 __kmp_nThreadsPerCore); 4099 KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_die.num * 4100 __kmp_hws_core.num * __kmp_hws_proc.num); 4101 nPackages = __kmp_hws_socket.num; // correct nPackages 4102 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 4103 nDiesPerPkg = __kmp_hws_die.num; // correct nDiesPerPkg 4104 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 4105 __kmp_avail_proc = n_new; // correct avail_proc 4106 __kmp_ncores = 4107 nPackages * nDiesPerPkg * __kmp_hws_core.num; // correct ncores 4108 } // non-hwloc topology method 4109 if (pAddr) { 4110 __kmp_free(*pAddr); 4111 *pAddr = newAddr; // replace old topology with new one 4112 } 4113 if (__kmp_affinity_verbose) { 4114 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 4115 kmp_str_buf_t buf; 4116 __kmp_str_buf_init(&buf); 4117 __kmp_str_buf_print(&buf, "%d", nPackages); 4118 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 4119 __kmp_nThreadsPerCore, __kmp_ncores); 4120 __kmp_str_buf_free(&buf); 4121 } 4122 _exit: 4123 if (__kmp_pu_os_idx != NULL) { 4124 __kmp_free(__kmp_pu_os_idx); 4125 __kmp_pu_os_idx = NULL; 4126 } 4127 } 4128 4129 // This function figures out the deepest level at which there is at least one 4130 // cluster/core with more than one processing unit bound to it. 4131 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 4132 int nprocs, int bottom_level) { 4133 int core_level = 0; 4134 4135 for (int i = 0; i < nprocs; i++) { 4136 for (int j = bottom_level; j > 0; j--) { 4137 if (address2os[i].first.labels[j] > 0) { 4138 if (core_level < (j - 1)) { 4139 core_level = j - 1; 4140 } 4141 } 4142 } 4143 } 4144 return core_level; 4145 } 4146 4147 // This function counts number of clusters/cores at given level. 4148 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 4149 int nprocs, int bottom_level, 4150 int core_level) { 4151 int ncores = 0; 4152 int i, j; 4153 4154 j = bottom_level; 4155 for (i = 0; i < nprocs; i++) { 4156 for (j = bottom_level; j > core_level; j--) { 4157 if ((i + 1) < nprocs) { 4158 if (address2os[i + 1].first.labels[j] > 0) { 4159 break; 4160 } 4161 } 4162 } 4163 if (j == core_level) { 4164 ncores++; 4165 } 4166 } 4167 if (j > core_level) { 4168 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 4169 // core. May occur when called from __kmp_affinity_find_core(). 4170 ncores++; 4171 } 4172 return ncores; 4173 } 4174 4175 // This function finds to which cluster/core given processing unit is bound. 
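// For illustration only (hypothetical uniform machine with 2 PUs per core and
// address2os sorted so a core's PUs are adjacent): the PU at index 5 falls in
// the third core, so the function returns 2.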
static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
                                    int bottom_level, int core_level) {
  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
                                       core_level) -
         1;
}

// This function finds the maximal number of processing units bound to a
// cluster/core at a given level.
static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
                                            int nprocs, int bottom_level,
                                            int core_level) {
  int maxprocpercore = 0;

  if (core_level < bottom_level) {
    for (int i = 0; i < nprocs; i++) {
      int percore = address2os[i].first.labels[core_level + 1] + 1;

      if (percore > maxprocpercore) {
        maxprocpercore = percore;
      }
    }
  } else {
    maxprocpercore = 1;
  }
  return maxprocpercore;
}

static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;

#if KMP_USE_HIER_SCHED
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  __kmp_create_affinity_none_places();                                         \
  __kmp_dispatch_set_hierarchy_values();                                       \
  return;
#else
#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  __kmp_create_affinity_none_places();                                         \
  return;
#endif

// Create a one-element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places() {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(__kmp_affinity_type == affinity_none);
  __kmp_affinity_num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
  kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
}

static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
  const Address *aa = &(((const AddrUnsPair *)a)->first);
  const Address *bb = &(((const AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  unsigned i;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  return 0;
}

static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors
  // that we know about on the machine.
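  // Illustration (hypothetical scenario): on a 16-proc machine where the
  // launching shell restricted the process to OS procs {0,2,4,6}, "respect"
  // leaves __kmp_affin_fullMask at those four procs (__kmp_avail_proc == 4),
  // while "norespect" widens it to all __kmp_xproc processors.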
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
    if (__kmp_affinity_respect_mask) {
      // Count the number of available processors.
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }

      if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      }
    } else {
      if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
#if KMP_OS_WINDOWS
      // Set the process affinity mask since threads' affinity masks must be
      // subsets of the process mask in Windows* OS
      __kmp_affin_fullMask->set_process_affinity(true);
#endif
    }
  }

  if (__kmp_affinity_gran == affinity_gran_tile &&
      // check if user's request is valid
      __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
    KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
    __kmp_affinity_gran = affinity_gran_package;
  }

  int depth = -1;
  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or
    // the verbose flag is set, and the nowarnings flag was not set.
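    // The probing order below is: hwloc (if compiled in and usable), then the
    // x2APIC leaf, then the legacy APIC method (x86 only), then /proc/cpuinfo
    // (Linux), then Windows processor groups (when more than one group is
    // present), and finally the flat OS-proc map.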
4341 const char *file_name = NULL; 4342 int line = 0; 4343 #if KMP_USE_HWLOC 4344 if (depth < 0 && 4345 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4346 if (__kmp_affinity_verbose) { 4347 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4348 } 4349 if (!__kmp_hwloc_error) { 4350 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4351 if (depth == 0) { 4352 KMP_EXIT_AFF_NONE; 4353 } 4354 } else if (__kmp_affinity_verbose) { 4355 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4356 } 4357 } 4358 #endif 4359 4360 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4361 4362 if (depth < 0) { 4363 if (__kmp_affinity_verbose) { 4364 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4365 } 4366 4367 file_name = NULL; 4368 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4369 if (depth == 0) { 4370 KMP_EXIT_AFF_NONE; 4371 } 4372 4373 if (depth < 0) { 4374 if (__kmp_affinity_verbose) { 4375 if (msg_id != kmp_i18n_null) { 4376 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", 4377 __kmp_i18n_catgets(msg_id), 4378 KMP_I18N_STR(DecodingLegacyAPIC)); 4379 } else { 4380 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4381 KMP_I18N_STR(DecodingLegacyAPIC)); 4382 } 4383 } 4384 4385 file_name = NULL; 4386 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4387 if (depth == 0) { 4388 KMP_EXIT_AFF_NONE; 4389 } 4390 } 4391 } 4392 4393 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4394 4395 #if KMP_OS_LINUX 4396 4397 if (depth < 0) { 4398 if (__kmp_affinity_verbose) { 4399 if (msg_id != kmp_i18n_null) { 4400 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", 4401 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4402 } else { 4403 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4404 } 4405 } 4406 4407 kmp_safe_raii_file_t f("/proc/cpuinfo", "r"); 4408 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4409 if (depth == 0) { 4410 KMP_EXIT_AFF_NONE; 4411 } 4412 } 4413 4414 #endif /* KMP_OS_LINUX */ 4415 4416 #if KMP_GROUP_AFFINITY 4417 4418 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4419 if (__kmp_affinity_verbose) { 4420 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4421 } 4422 4423 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4424 KMP_ASSERT(depth != 0); 4425 } 4426 4427 #endif /* KMP_GROUP_AFFINITY */ 4428 4429 if (depth < 0) { 4430 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4431 if (file_name == NULL) { 4432 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4433 } else if (line == 0) { 4434 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4435 } else { 4436 KMP_INFORM(UsingFlatOSFileLine, file_name, line, 4437 __kmp_i18n_catgets(msg_id)); 4438 } 4439 } 4440 // FIXME - print msg if msg_id = kmp_i18n_null ??? 
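    // The flat map below is the last resort: it falls back to a minimal
    // per-OS-proc description of the machine and is expected not to fail
    // (hence the asserts on depth and address2os that follow).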
4441 4442 file_name = ""; 4443 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4444 if (depth == 0) { 4445 KMP_EXIT_AFF_NONE; 4446 } 4447 KMP_ASSERT(depth > 0); 4448 KMP_ASSERT(address2os != NULL); 4449 } 4450 } 4451 4452 #if KMP_USE_HWLOC 4453 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4454 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4455 if (__kmp_affinity_verbose) { 4456 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4457 } 4458 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4459 if (depth == 0) { 4460 KMP_EXIT_AFF_NONE; 4461 } 4462 } 4463 #endif // KMP_USE_HWLOC 4464 4465 // If the user has specified that a particular topology discovery method is to 4466 // be used, then we abort if that method fails. The exception is group 4467 // affinity, which might have been implicitly set. 4468 4469 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4470 4471 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 4472 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 4473 if (__kmp_affinity_verbose) { 4474 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4475 } 4476 4477 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4478 if (depth == 0) { 4479 KMP_EXIT_AFF_NONE; 4480 } 4481 if (depth < 0) { 4482 KMP_ASSERT(msg_id != kmp_i18n_null); 4483 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4484 } 4485 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4486 if (__kmp_affinity_verbose) { 4487 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4488 } 4489 4490 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4491 if (depth == 0) { 4492 KMP_EXIT_AFF_NONE; 4493 } 4494 if (depth < 0) { 4495 KMP_ASSERT(msg_id != kmp_i18n_null); 4496 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4497 } 4498 } 4499 4500 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4501 4502 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4503 const char *filename; 4504 const char *env_var = nullptr; 4505 if (__kmp_cpuinfo_file != NULL) { 4506 filename = __kmp_cpuinfo_file; 4507 env_var = "KMP_CPUINFO_FILE"; 4508 } else { 4509 filename = "/proc/cpuinfo"; 4510 } 4511 4512 if (__kmp_affinity_verbose) { 4513 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4514 } 4515 4516 kmp_safe_raii_file_t f(filename, "r", env_var); 4517 int line = 0; 4518 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4519 if (depth < 0) { 4520 KMP_ASSERT(msg_id != kmp_i18n_null); 4521 if (line > 0) { 4522 KMP_FATAL(FileLineMsgExiting, filename, line, 4523 __kmp_i18n_catgets(msg_id)); 4524 } else { 4525 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4526 } 4527 } 4528 if (__kmp_affinity_type == affinity_none) { 4529 KMP_ASSERT(depth == 0); 4530 KMP_EXIT_AFF_NONE; 4531 } 4532 } 4533 4534 #if KMP_GROUP_AFFINITY 4535 4536 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4537 if (__kmp_affinity_verbose) { 4538 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4539 } 4540 4541 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4542 KMP_ASSERT(depth != 0); 4543 if (depth < 0) { 4544 KMP_ASSERT(msg_id != kmp_i18n_null); 4545 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4546 } 4547 } 4548 4549 #endif /* KMP_GROUP_AFFINITY */ 4550 4551 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4552 if (__kmp_affinity_verbose) { 4553 KMP_INFORM(AffUsingFlatOS, 
"KMP_AFFINITY"); 4554 } 4555 4556 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4557 if (depth == 0) { 4558 KMP_EXIT_AFF_NONE; 4559 } 4560 // should not fail 4561 KMP_ASSERT(depth > 0); 4562 KMP_ASSERT(address2os != NULL); 4563 } 4564 4565 #if KMP_USE_HIER_SCHED 4566 __kmp_dispatch_set_hierarchy_values(); 4567 #endif 4568 4569 if (address2os == NULL) { 4570 if (KMP_AFFINITY_CAPABLE() && 4571 (__kmp_affinity_verbose || 4572 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4573 KMP_WARNING(ErrorInitializeAffinity); 4574 } 4575 __kmp_affinity_type = affinity_none; 4576 __kmp_create_affinity_none_places(); 4577 KMP_AFFINITY_DISABLE(); 4578 return; 4579 } 4580 4581 if (__kmp_affinity_gran == affinity_gran_tile 4582 #if KMP_USE_HWLOC 4583 && __kmp_tile_depth == 0 4584 #endif 4585 ) { 4586 // tiles requested but not detected, warn user on this 4587 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); 4588 } 4589 4590 __kmp_apply_thread_places(&address2os, depth); 4591 4592 // Create the table of masks, indexed by thread Id. 4593 unsigned maxIndex; 4594 unsigned numUnique; 4595 kmp_affin_mask_t *osId2Mask = 4596 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4597 if (__kmp_affinity_gran_levels == 0) { 4598 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4599 } 4600 4601 // Set the childNums vector in all Address objects. This must be done before 4602 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4603 // account the setting of __kmp_affinity_compact. 4604 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4605 4606 switch (__kmp_affinity_type) { 4607 4608 case affinity_explicit: 4609 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4610 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 4611 __kmp_affinity_process_proclist( 4612 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4613 __kmp_affinity_proclist, osId2Mask, maxIndex); 4614 } else { 4615 __kmp_affinity_process_placelist( 4616 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4617 __kmp_affinity_proclist, osId2Mask, maxIndex); 4618 } 4619 if (__kmp_affinity_num_masks == 0) { 4620 if (__kmp_affinity_verbose || 4621 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4622 KMP_WARNING(AffNoValidProcID); 4623 } 4624 __kmp_affinity_type = affinity_none; 4625 __kmp_create_affinity_none_places(); 4626 return; 4627 } 4628 break; 4629 4630 // The other affinity types rely on sorting the Addresses according to some 4631 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4632 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4633 // to do the sort and create the array of affinity masks. 
4634 4635 case affinity_logical: 4636 __kmp_affinity_compact = 0; 4637 if (__kmp_affinity_offset) { 4638 __kmp_affinity_offset = 4639 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4640 } 4641 goto sortAddresses; 4642 4643 case affinity_physical: 4644 if (__kmp_nThreadsPerCore > 1) { 4645 __kmp_affinity_compact = 1; 4646 if (__kmp_affinity_compact >= depth) { 4647 __kmp_affinity_compact = 0; 4648 } 4649 } else { 4650 __kmp_affinity_compact = 0; 4651 } 4652 if (__kmp_affinity_offset) { 4653 __kmp_affinity_offset = 4654 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4655 } 4656 goto sortAddresses; 4657 4658 case affinity_scatter: 4659 if (__kmp_affinity_compact >= depth) { 4660 __kmp_affinity_compact = 0; 4661 } else { 4662 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4663 } 4664 goto sortAddresses; 4665 4666 case affinity_compact: 4667 if (__kmp_affinity_compact >= depth) { 4668 __kmp_affinity_compact = depth - 1; 4669 } 4670 goto sortAddresses; 4671 4672 case affinity_balanced: 4673 if (depth <= 1) { 4674 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4675 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4676 } 4677 __kmp_affinity_type = affinity_none; 4678 __kmp_create_affinity_none_places(); 4679 return; 4680 } else if (!__kmp_affinity_uniform_topology()) { 4681 // Save the depth for further usage 4682 __kmp_aff_depth = depth; 4683 4684 int core_level = __kmp_affinity_find_core_level( 4685 address2os, __kmp_avail_proc, depth - 1); 4686 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4687 depth - 1, core_level); 4688 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4689 address2os, __kmp_avail_proc, depth - 1, core_level); 4690 4691 int nproc = ncores * maxprocpercore; 4692 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4693 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4694 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4695 } 4696 __kmp_affinity_type = affinity_none; 4697 return; 4698 } 4699 4700 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4701 for (int i = 0; i < nproc; i++) { 4702 procarr[i] = -1; 4703 } 4704 4705 int lastcore = -1; 4706 int inlastcore = 0; 4707 for (int i = 0; i < __kmp_avail_proc; i++) { 4708 int proc = address2os[i].second; 4709 int core = 4710 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4711 4712 if (core == lastcore) { 4713 inlastcore++; 4714 } else { 4715 inlastcore = 0; 4716 } 4717 lastcore = core; 4718 4719 procarr[core * maxprocpercore + inlastcore] = proc; 4720 } 4721 } 4722 if (__kmp_affinity_compact >= depth) { 4723 __kmp_affinity_compact = depth - 1; 4724 } 4725 4726 sortAddresses: 4727 // Allocate the gtid->affinity mask table. 4728 if (__kmp_affinity_dups) { 4729 __kmp_affinity_num_masks = __kmp_avail_proc; 4730 } else { 4731 __kmp_affinity_num_masks = numUnique; 4732 } 4733 4734 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4735 (__kmp_affinity_num_places > 0) && 4736 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4737 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4738 } 4739 4740 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4741 4742 // Sort the address2os table according to the current setting of 4743 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
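  // (Note on the sort key: __kmp_affinity_cmp_Address_child_num compares the
  // deepest __kmp_affinity_compact levels first and the remaining levels
  // top-down, which is how the cases above steer the ordering. When
  // __kmp_affinity_dups is off, only the "leader" entry of each granularity
  // group contributes a mask below, so the table holds numUnique masks rather
  // than one per OS proc, unless __kmp_affinity_num_places caps it further
  // just above.)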
4744 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4745 __kmp_affinity_cmp_Address_child_num); 4746 { 4747 int i; 4748 unsigned j; 4749 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4750 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { 4751 continue; 4752 } 4753 unsigned osId = address2os[i].second; 4754 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4755 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4756 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4757 KMP_CPU_COPY(dest, src); 4758 if (++j >= __kmp_affinity_num_masks) { 4759 break; 4760 } 4761 } 4762 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4763 } 4764 break; 4765 4766 default: 4767 KMP_ASSERT2(0, "Unexpected affinity setting"); 4768 } 4769 4770 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4771 machine_hierarchy.init(address2os, __kmp_avail_proc); 4772 } 4773 #undef KMP_EXIT_AFF_NONE 4774 4775 void __kmp_affinity_initialize(void) { 4776 // Much of the code above was written assuming that if a machine was not 4777 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4778 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4779 // There are too many checks for __kmp_affinity_type == affinity_none 4780 // in this code. Instead of trying to change them all, check if 4781 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4782 // affinity_none, call the real initialization routine, then restore 4783 // __kmp_affinity_type to affinity_disabled. 4784 int disabled = (__kmp_affinity_type == affinity_disabled); 4785 if (!KMP_AFFINITY_CAPABLE()) { 4786 KMP_ASSERT(disabled); 4787 } 4788 if (disabled) { 4789 __kmp_affinity_type = affinity_none; 4790 } 4791 __kmp_aux_affinity_initialize(); 4792 if (disabled) { 4793 __kmp_affinity_type = affinity_disabled; 4794 } 4795 } 4796 4797 void __kmp_affinity_uninitialize(void) { 4798 if (__kmp_affinity_masks != NULL) { 4799 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4800 __kmp_affinity_masks = NULL; 4801 } 4802 if (__kmp_affin_fullMask != NULL) { 4803 KMP_CPU_FREE(__kmp_affin_fullMask); 4804 __kmp_affin_fullMask = NULL; 4805 } 4806 __kmp_affinity_num_masks = 0; 4807 __kmp_affinity_type = affinity_default; 4808 __kmp_affinity_num_places = 0; 4809 if (__kmp_affinity_proclist != NULL) { 4810 __kmp_free(__kmp_affinity_proclist); 4811 __kmp_affinity_proclist = NULL; 4812 } 4813 if (address2os != NULL) { 4814 __kmp_free(address2os); 4815 address2os = NULL; 4816 } 4817 if (procarr != NULL) { 4818 __kmp_free(procarr); 4819 procarr = NULL; 4820 } 4821 #if KMP_USE_HWLOC 4822 if (__kmp_hwloc_topology != NULL) { 4823 hwloc_topology_destroy(__kmp_hwloc_topology); 4824 __kmp_hwloc_topology = NULL; 4825 } 4826 #endif 4827 KMPAffinity::destroy_api(); 4828 } 4829 4830 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4831 if (!KMP_AFFINITY_CAPABLE()) { 4832 return; 4833 } 4834 4835 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4836 if (th->th.th_affin_mask == NULL) { 4837 KMP_CPU_ALLOC(th->th.th_affin_mask); 4838 } else { 4839 KMP_CPU_ZERO(th->th.th_affin_mask); 4840 } 4841 4842 // Copy the thread mask to the kmp_info_t structure. If 4843 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that 4844 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4845 // then the full mask is the same as the mask of the initialization thread. 
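  // Example (hypothetical values): with __kmp_affinity_num_masks == 8 and
  // __kmp_affinity_offset == 2, gtid 0 is bound to place (0 + 2) % 8 == 2,
  // gtid 1 to place 3, and so on, as computed below.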
4846 kmp_affin_mask_t *mask; 4847 int i; 4848 4849 if (KMP_AFFINITY_NON_PROC_BIND) { 4850 if ((__kmp_affinity_type == affinity_none) || 4851 (__kmp_affinity_type == affinity_balanced)) { 4852 #if KMP_GROUP_AFFINITY 4853 if (__kmp_num_proc_groups > 1) { 4854 return; 4855 } 4856 #endif 4857 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4858 i = 0; 4859 mask = __kmp_affin_fullMask; 4860 } else { 4861 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4862 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4863 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4864 } 4865 } else { 4866 if ((!isa_root) || 4867 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4868 #if KMP_GROUP_AFFINITY 4869 if (__kmp_num_proc_groups > 1) { 4870 return; 4871 } 4872 #endif 4873 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4874 i = KMP_PLACE_ALL; 4875 mask = __kmp_affin_fullMask; 4876 } else { 4877 // int i = some hash function or just a counter that doesn't 4878 // always start at 0. Use gtid for now. 4879 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4880 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4881 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4882 } 4883 } 4884 4885 th->th.th_current_place = i; 4886 if (isa_root) { 4887 th->th.th_new_place = i; 4888 th->th.th_first_place = 0; 4889 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4890 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4891 // When using a Non-OMP_PROC_BIND affinity method, 4892 // set all threads' place-partition-var to the entire place list 4893 th->th.th_first_place = 0; 4894 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4895 } 4896 4897 if (i == KMP_PLACE_ALL) { 4898 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4899 gtid)); 4900 } else { 4901 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4902 gtid, i)); 4903 } 4904 4905 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4906 4907 if (__kmp_affinity_verbose 4908 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4909 && (__kmp_affinity_type == affinity_none || 4910 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4911 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4912 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4913 th->th.th_affin_mask); 4914 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4915 __kmp_gettid(), gtid, buf); 4916 } 4917 4918 #if KMP_OS_WINDOWS 4919 // On Windows* OS, the process affinity mask might have changed. If the user 4920 // didn't request affinity and this call fails, just continue silently. 4921 // See CQ171393. 4922 if (__kmp_affinity_type == affinity_none) { 4923 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4924 } else 4925 #endif 4926 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4927 } 4928 4929 void __kmp_affinity_set_place(int gtid) { 4930 if (!KMP_AFFINITY_CAPABLE()) { 4931 return; 4932 } 4933 4934 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4935 4936 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4937 "place = %d)\n", 4938 gtid, th->th.th_new_place, th->th.th_current_place)); 4939 4940 // Check that the new place is within this thread's partition. 
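  // (The place partition may wrap around the end of the place list, i.e.
  // th_first_place > th_last_place, which is why both orderings are accepted
  // by the assertions below.)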
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf(
            "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
            gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
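  // (Presumably so the mask just installed via kmp_set_affinity stays in
  // effect rather than being overridden by OMP_PROC_BIND-style place binding.)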
5030 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 5031 5032 return retval; 5033 } 5034 5035 int __kmp_aux_get_affinity(void **mask) { 5036 int gtid; 5037 int retval; 5038 kmp_info_t *th; 5039 5040 if (!KMP_AFFINITY_CAPABLE()) { 5041 return -1; 5042 } 5043 5044 gtid = __kmp_entry_gtid(); 5045 th = __kmp_threads[gtid]; 5046 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5047 5048 KA_TRACE( 5049 1000, (""); { 5050 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5051 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5052 th->th.th_affin_mask); 5053 __kmp_printf( 5054 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 5055 buf); 5056 }); 5057 5058 if (__kmp_env_consistency_check) { 5059 if ((mask == NULL) || (*mask == NULL)) { 5060 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 5061 } 5062 } 5063 5064 #if !KMP_OS_WINDOWS 5065 5066 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5067 KA_TRACE( 5068 1000, (""); { 5069 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5070 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5071 (kmp_affin_mask_t *)(*mask)); 5072 __kmp_printf( 5073 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 5074 buf); 5075 }); 5076 return retval; 5077 5078 #else 5079 (void)retval; 5080 5081 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 5082 return 0; 5083 5084 #endif /* KMP_OS_WINDOWS */ 5085 } 5086 5087 int __kmp_aux_get_affinity_max_proc() { 5088 if (!KMP_AFFINITY_CAPABLE()) { 5089 return 0; 5090 } 5091 #if KMP_GROUP_AFFINITY 5092 if (__kmp_num_proc_groups > 1) { 5093 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 5094 } 5095 #endif 5096 return __kmp_xproc; 5097 } 5098 5099 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 5100 if (!KMP_AFFINITY_CAPABLE()) { 5101 return -1; 5102 } 5103 5104 KA_TRACE( 5105 1000, (""); { 5106 int gtid = __kmp_entry_gtid(); 5107 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5108 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5109 (kmp_affin_mask_t *)(*mask)); 5110 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 5111 "affinity mask for thread %d = %s\n", 5112 proc, gtid, buf); 5113 }); 5114 5115 if (__kmp_env_consistency_check) { 5116 if ((mask == NULL) || (*mask == NULL)) { 5117 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 5118 } 5119 } 5120 5121 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5122 return -1; 5123 } 5124 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5125 return -2; 5126 } 5127 5128 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 5129 return 0; 5130 } 5131 5132 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 5133 if (!KMP_AFFINITY_CAPABLE()) { 5134 return -1; 5135 } 5136 5137 KA_TRACE( 5138 1000, (""); { 5139 int gtid = __kmp_entry_gtid(); 5140 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5141 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5142 (kmp_affin_mask_t *)(*mask)); 5143 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 5144 "affinity mask for thread %d = %s\n", 5145 proc, gtid, buf); 5146 }); 5147 5148 if (__kmp_env_consistency_check) { 5149 if ((mask == NULL) || (*mask == NULL)) { 5150 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 5151 } 5152 } 5153 5154 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5155 return -1; 5156 } 5157 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5158 return -2; 5159 } 5160 5161 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 5162 
return 0; 5163 } 5164 5165 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 5166 if (!KMP_AFFINITY_CAPABLE()) { 5167 return -1; 5168 } 5169 5170 KA_TRACE( 5171 1000, (""); { 5172 int gtid = __kmp_entry_gtid(); 5173 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5174 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5175 (kmp_affin_mask_t *)(*mask)); 5176 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 5177 "affinity mask for thread %d = %s\n", 5178 proc, gtid, buf); 5179 }); 5180 5181 if (__kmp_env_consistency_check) { 5182 if ((mask == NULL) || (*mask == NULL)) { 5183 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 5184 } 5185 } 5186 5187 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5188 return -1; 5189 } 5190 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5191 return 0; 5192 } 5193 5194 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 5195 } 5196 5197 // Dynamic affinity settings - Affinity balanced 5198 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 5199 KMP_DEBUG_ASSERT(th); 5200 bool fine_gran = true; 5201 int tid = th->th.th_info.ds.ds_tid; 5202 5203 switch (__kmp_affinity_gran) { 5204 case affinity_gran_fine: 5205 case affinity_gran_thread: 5206 break; 5207 case affinity_gran_core: 5208 if (__kmp_nThreadsPerCore > 1) { 5209 fine_gran = false; 5210 } 5211 break; 5212 case affinity_gran_package: 5213 if (nCoresPerPkg > 1) { 5214 fine_gran = false; 5215 } 5216 break; 5217 default: 5218 fine_gran = false; 5219 } 5220 5221 if (__kmp_affinity_uniform_topology()) { 5222 int coreID; 5223 int threadID; 5224 // Number of hyper threads per core in HT machine 5225 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 5226 // Number of cores 5227 int ncores = __kmp_ncores; 5228 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 5229 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 5230 ncores = nPackages; 5231 } 5232 // How many threads will be bound to each core 5233 int chunk = nthreads / ncores; 5234 // How many cores will have an additional thread bound to it - "big cores" 5235 int big_cores = nthreads % ncores; 5236 // Number of threads on the big cores 5237 int big_nth = (chunk + 1) * big_cores; 5238 if (tid < big_nth) { 5239 coreID = tid / (chunk + 1); 5240 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 5241 } else { // tid >= big_nth 5242 coreID = (tid - big_cores) / chunk; 5243 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 5244 } 5245 5246 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 5247 "Illegal set affinity operation when not capable"); 5248 5249 kmp_affin_mask_t *mask = th->th.th_affin_mask; 5250 KMP_CPU_ZERO(mask); 5251 5252 if (fine_gran) { 5253 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 5254 KMP_CPU_SET(osID, mask); 5255 } else { 5256 for (int i = 0; i < __kmp_nth_per_core; i++) { 5257 int osID; 5258 osID = address2os[coreID * __kmp_nth_per_core + i].second; 5259 KMP_CPU_SET(osID, mask); 5260 } 5261 } 5262 if (__kmp_affinity_verbose) { 5263 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5264 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5265 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5266 __kmp_gettid(), tid, buf); 5267 } 5268 __kmp_set_system_affinity(mask, TRUE); 5269 } else { // Non-uniform topology 5270 5271 kmp_affin_mask_t *mask = th->th.th_affin_mask; 5272 KMP_CPU_ZERO(mask); 5273 5274 int core_level = __kmp_affinity_find_core_level( 5275 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 5276 int ncores 
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = address2os[tid].second;
        KMP_CPU_SET(osID, mask);
      } else {
        int core = __kmp_affinity_find_core(address2os, tid,
                                            __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = address2os[i].second;
          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
                                       core_level) == core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep the number of threads per context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip cores with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is the GetProcessAffinityMask() API
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
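
// The sketch below is not part of the runtime; it is a minimal, self-contained
// illustration of the thread-to-core arithmetic used in the uniform-topology
// branch of __kmp_balanced_affinity() above. The machine shape (4 cores, 2
// hardware threads per core) and the team size (6 threads) are hypothetical
// values chosen only to show how chunk/big_cores/big_nth split the team, and
// balanced_mapping_demo() is a made-up name; no real affinity mask is touched.
// It is guarded with #if 0 so it is not compiled into the library.
#if 0
#include <stdio.h>

static void balanced_mapping_demo() {
  const int ncores = 4;       // hypothetical core count
  const int nth_per_core = 2; // hypothetical hardware threads per core
  const int nthreads = 6;     // hypothetical OpenMP team size

  int chunk = nthreads / ncores;         // threads bound to every core
  int big_cores = nthreads % ncores;     // cores that receive one extra thread
  int big_nth = (chunk + 1) * big_cores; // threads living on the "big" cores

  for (int tid = 0; tid < nthreads; ++tid) {
    int coreID, threadID;
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % nth_per_core;
    }
    // With fine granularity only the OS proc at (coreID, threadID) would be
    // set in the mask; with coarser granularity every proc of coreID would be.
    printf("tid %d -> core %d, hw thread %d\n", tid, coreID, threadID);
  }
}
// For these hypothetical values the demo prints: tids 0-3 pack two per core
// onto cores 0 and 1 (the "big" cores), while tids 4 and 5 each get cores 2
// and 3 to themselves.
#endif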
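
// The sketch below is likewise not part of the runtime; it is a minimal
// example of the five-step usage pattern documented above
// kmp_set_thread_affinity_mask_initial(), assuming a Linux target where
// sched_getaffinity()/sched_setaffinity() are available and the program is
// linked against this runtime. do_external_parallel_work() and
// run_with_widened_affinity() are hypothetical placeholder names. It is
// guarded with #if 0 so it is not compiled into the library.
#if 0
#ifndef _GNU_SOURCE
#define _GNU_SOURCE // for cpu_set_t, CPU_ZERO, sched_{get,set}affinity
#endif
#include <sched.h>
#include <stdio.h>

extern "C" int kmp_set_thread_affinity_mask_initial();

static void do_external_parallel_work() { /* e.g. pthreads- or TBB-based work */
}

static void run_with_widened_affinity() {
  // 1) Save the calling thread's current affinity mask.
  cpu_set_t saved;
  CPU_ZERO(&saved);
  if (sched_getaffinity(0, sizeof(saved), &saved) != 0) {
    perror("sched_getaffinity");
    return;
  }
  // 2) Widen this thread's mask to the runtime's full initial mask.
  int rc = kmp_set_thread_affinity_mask_initial();
  // 3) Error check: 0 = success, -1 = could not bind, >0 = errno from binding.
  if (rc != 0) {
    fprintf(stderr, "kmp_set_thread_affinity_mask_initial returned %d\n", rc);
    return;
  }
  // 4) Run the non-OpenMP parallel work with the widened mask.
  do_external_parallel_work();
  // 5) Restore the mask saved in step 1.
  if (sched_setaffinity(0, sizeof(saved), &saved) != 0)
    perror("sched_setaffinity");
}
// Only the calling thread's mask is touched here; OpenMP worker threads keep
// whatever bindings the runtime gave them (e.g. via KMP_AFFINITY).
#endif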