/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast<int>(start),
                   static_cast<int>(previous));
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast<int>(start));
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast<int>(previous));
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%d-%d", static_cast<int>(start),
                          static_cast<int>(previous));
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%d", static_cast<int>(start));
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%d", static_cast<int>(previous));
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
// the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
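// With the default English message catalog, each emitted line looks roughly
// like (illustrative values only): "KMP_AFFINITY: OS proc 5 maps to package 0
// core 2 thread 1", i.e. one label per modeled level for every OS proc.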
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);
    if (depth > 1) {
      int level = 1; // iterate over levels
      int label = 1; // iterate over labels
      if (__kmp_numa_detected)
        // node level follows package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                              addrP[proc].first.labels[label++]);
      if (__kmp_tile_depth > 0)
        // tile level follows node if any, or package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                              addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // core level follows
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                            addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // thread level is the last
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                            addrP[proc].first.labels[label++]);
      KMP_DEBUG_ASSERT(label == depth);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels, so it is removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int level;
  int i;
  int radix1_detected;
  int new_depth = depth;
  for (level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in address2os array
    if (level == new_depth) {
      // "turn off" deepest level, just decrement the depth that removes
      // the level from address2os array
      for (i = 0; i < nTh; ++i) {
        addrP[i].first.depth--;
      }
    } else {
      // For other levels, we move labels over and also reduce the depth
      int j;
      for (j = level; j < new_depth; ++j) {
        for (i = 0; i < nTh; ++i) {
          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
          addrP[i].first.depth--;
        }
        levels[j + 1] -= 1;
      }
    }
  }
  return new_depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the package
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
           obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o,
                                               kmp_hwloc_depth_t depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if none are found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if none are found (as PU arity is 0)
}

static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
  hwloc_obj_t core = NULL;
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
    hwloc_obj_t pu = NULL;
    KMP_DEBUG_ASSERT(core != NULL);
    int num_active_threads = 0;
    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
    // int NT = core->arity; pu = core->first_child; // faster?
    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
        continue; // skip inactive (inaccessible) unit
      Address addr(depth + 2);
      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                    obj->os_index, obj->logical_index, core->os_index,
                    core->logical_index, pu->os_index, pu->logical_index));
      for (int i = 0; i < depth; ++i)
        addr.labels[i] = labels[i]; // package, etc.
      addr.labels[depth] = core_id; // core
      addr.labels[depth + 1] = pu_id; // pu
      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
      __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
      nActiveThreads++;
      ++num_active_threads; // count active threads per core
    }
    if (num_active_threads) { // were there any active threads on the core?
      ++__kmp_ncores; // count total active cores
      ++num_active_cores; // count active cores per socket
      if (num_active_threads > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
    }
  }
  return 0;
}

// Check if a NUMA node is detected below the package,
// and if a tile object is detected, return its depth.
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth, l2cache_depth, package_depth;

  // Get some PU
  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  if (hN != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
  l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  // check tile, get object by depth because of multiple caches possible
  depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);
  __kmp_hwloc_check_numa();

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
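    // (Illustrative numbers: if hwloc reports 4 cores per package and 1 PU per
    // core while __kmp_xproc == 8, the code below records nCoresPerPkg = 4,
    // __kmp_nThreadsPerCore = 1, __kmp_ncores = 8 and nPackages = 2.)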
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of the PU object, so check PACKAGE and
    // CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth = 3;
  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
  if (__kmp_numa_detected)
    ++depth;
  if (__kmp_tile_depth)
    ++depth;

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.

  hwloc_obj_t socket, node, tile;
  int nActiveThreads = 0;
  int socket_id = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
      socket_id++) {
    labels[0] = socket_id;
    if (__kmp_numa_detected) {
      int NN;
      int n_active_nodes = 0;
      node = NULL;
      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
                                              &node);
      for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
        labels[1] = node_id;
        if (__kmp_tile_depth) {
          // NUMA + tiles
          int NT;
          int n_active_tiles = 0;
          tile = NULL;
          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
                                                   &tile);
          for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
            labels[2] = tl_id;
            int n_active_cores = 0;
            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                            n_active_cores, tile, 3, labels);
            if (n_active_cores) { // were there any active cores on the tile?
              ++n_active_tiles; // count active tiles per node
              if (n_active_cores > nCorePerTile)
                nCorePerTile = n_active_cores; // calc maximum
            }
          }
          if (n_active_tiles) { // were there any active tiles on the node?
            ++n_active_nodes; // count active nodes per package
            if (n_active_tiles > nTilePerNode)
              nTilePerNode = n_active_tiles; // calc maximum
          }
        } else {
          // NUMA, no tiles
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, node, 2, labels);
          if (n_active_cores) { // were there any active cores on the node?
            ++n_active_nodes; // count active nodes per package
            if (n_active_cores > nCorePerNode)
              nCorePerNode = n_active_cores; // calc maximum
          }
        }
      }
      if (n_active_nodes) { // were there any active nodes on the socket?
        ++nPackages; // count total active packages
        if (n_active_nodes > nNodePerPkg)
          nNodePerPkg = n_active_nodes; // calc maximum
      }
    } else {
      if (__kmp_tile_depth) {
        // no NUMA, tiles
        int NT;
        int n_active_tiles = 0;
        tile = NULL;
        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
                                                 &tile);
        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
          labels[1] = tl_id;
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, tile, 2, labels);
          if (n_active_cores) { // were there any active cores on the tile?
            ++n_active_tiles; // count active tiles per package
            if (n_active_cores > nCorePerTile)
              nCorePerTile = n_active_cores; // calc maximum
          }
        }
        if (n_active_tiles) { // were there any active tiles on the socket?
          ++nPackages; // count total active packages
          if (n_active_tiles > nTilePerPkg)
            nTilePerPkg = n_active_tiles; // calc maximum
        }
      } else {
        // no NUMA, no tiles
        int n_active_cores = 0;
        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
                                        socket, 1, labels);
        if (n_active_cores) { // were there any active cores on the socket?
          ++nPackages; // count total active packages
          if (n_active_cores > nCoresPerPkg)
            nCoresPerPkg = n_active_cores; // calc maximum
        }
      }
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
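  // (__kmp_affinity_cmp_Address_labels orders entries by their label arrays,
  // i.e. package first, then any node/tile/core/thread labels, so procs that
  // share a package end up adjacent in the table.)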
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  int nPUs = nPackages * __kmp_nThreadsPerCore;
  if (__kmp_numa_detected) {
    if (__kmp_tile_depth) { // NUMA + tiles
      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
    } else { // NUMA, no tiles
      nPUs *= (nNodePerPkg * nCorePerNode);
    }
  } else {
    if (__kmp_tile_depth) { // no NUMA, tiles
      nPUs *= (nTilePerPkg * nCorePerTile);
    } else { // no NUMA, no tiles
      nPUs *= nCoresPerPkg;
    }
  }
  unsigned uniform = (nPUs == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    if (__kmp_numa_detected) {
      if (__kmp_tile_depth) { // NUMA + tiles
        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
                   __kmp_ncores);
      } else { // NUMA, no tiles
        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
        nPUs *= (nNodePerPkg * nCorePerNode);
      }
    } else {
      if (__kmp_tile_depth) { // no NUMA, tiles
        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
      } else { // no NUMA, no tiles
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        __kmp_str_buf_print(&buf, "%d", nPackages);
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
                   __kmp_nThreadsPerCore, __kmp_ncores);
        __kmp_str_buf_free(&buf);
      }
    }
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth_full = depth; // number of levels before compressing
  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 levels);
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if (__kmp_affinity_gran > affinity_gran_thread) {
      for (int i = 1; i <= depth_full; ++i) {
        if (__kmp_affinity_gran <= i) // only count deeper levels
          break;
        if (levels[depth_full - i] > 0)
          __kmp_affinity_gran_levels++;
      }
    }
    if (__kmp_affinity_gran > affinity_gran_package)
      __kmp_affinity_gran_levels++; // e.g. granularity = group
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
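// In the flat model every available OS proc becomes its own "package": proc i
// is stored with the single label {i}, so the map has depth 1 and every
// granularity setting effectively resolves to a single proc.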
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
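  // Otherwise each proc gets a two-level address {group, proc-within-group};
  // e.g. on a 64-bit build (64 bits per DWORD_PTR mask) OS proc 70 would be
  // recorded as group 1, offset 6 (70 / 64 and 70 % 64).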
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieving
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
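  // (cpuid(0) reports the highest supported standard leaf in eax, so a value
  // below 4 means the legacy APIC-id topology method is unavailable.)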
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
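    // For example (illustrative values): with maxThreadsPerPkg = 8 and
    // maxCoresPerPkg = 4, widthCT = 3 and widthC = 2, so widthT = 1; an apic id
    // of 0x2D (binary 101101) then yields pkgId = 5, coreId = 2, threadId = 1.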
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the cpuid instruction,
  // for any thread bound to a given package, had better return the same info
  // for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consistency checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
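  // At this point pkgCt holds the number of packages seen, nCoresPerPkg and
  // __kmp_nThreadsPerCore hold the maximum core/thread radix observed at each
  // level, and nCores counts distinct (pkgId, coreId) pairs in the sorted table.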
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg.
  // We will try to get more accurate values later by explicitly counting them,
  // but get reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but so
      // far, the only machine we have seen which does not exit this loop before
      // iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
1566 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1567 1568 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1569 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1570 if (__kmp_affinity_verbose) { 1571 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1572 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1573 if (__kmp_affinity_uniform_topology()) { 1574 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1575 } else { 1576 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1577 } 1578 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1579 __kmp_nThreadsPerCore, __kmp_ncores); 1580 } 1581 return 0; 1582 } 1583 1584 // From here on, we can assume that it is safe to call 1585 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 1586 // __kmp_affinity_type = affinity_none. 1587 1588 // Save the affinity mask for the current thread. 1589 kmp_affin_mask_t *oldMask; 1590 KMP_CPU_ALLOC(oldMask); 1591 __kmp_get_system_affinity(oldMask, TRUE); 1592 1593 // Allocate the data structure to be returned. 1594 AddrUnsPair *retval = 1595 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1596 1597 // Run through each of the available contexts, binding the current thread 1598 // to it, and obtaining the pertinent information using the cpuid instr. 1599 unsigned int proc; 1600 int nApics = 0; 1601 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1602 // Skip this proc if it is not included in the machine model. 1603 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1604 continue; 1605 } 1606 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1607 1608 __kmp_affinity_dispatch->bind_thread(proc); 1609 1610 // Extract labels for each level in the machine topology map from Apic ID. 1611 Address addr(depth); 1612 int prev_shift = 0; 1613 1614 for (level = 0; level < depth; level++) { 1615 __kmp_x86_cpuid(11, level, &buf); 1616 unsigned apicId = buf.edx; 1617 if (buf.ebx == 0) { 1618 if (level != depth - 1) { 1619 KMP_CPU_FREE(oldMask); 1620 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1621 return -1; 1622 } 1623 addr.labels[depth - level - 1] = apicId >> prev_shift; 1624 level++; 1625 break; 1626 } 1627 int shift = buf.eax & 0x1f; 1628 int mask = (1 << shift) - 1; 1629 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1630 prev_shift = shift; 1631 } 1632 if (level != depth) { 1633 KMP_CPU_FREE(oldMask); 1634 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1635 return -1; 1636 } 1637 1638 retval[nApics] = AddrUnsPair(addr, proc); 1639 nApics++; 1640 } 1641 1642 // We've collected all the info we need. 1643 // Restore the old affinity mask for this thread. 1644 __kmp_set_system_affinity(oldMask, TRUE); 1645 1646 // If there's only one thread context to bind to, return now. 1647 KMP_ASSERT(nApics > 0); 1648 if (nApics == 1) { 1649 __kmp_ncores = nPackages = 1; 1650 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1651 if (__kmp_affinity_verbose) { 1652 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1653 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1654 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1655 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1656 __kmp_nThreadsPerCore, __kmp_ncores); 1657 } 1658 1659 if (__kmp_affinity_type == affinity_none) { 1660 __kmp_free(retval); 1661 KMP_CPU_FREE(oldMask); 1662 return 0; 1663 } 1664 1665 // Form an Address object which only includes the package level. 
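// Only the package label of the full-depth address gathered above is kept
// (retval[0].first.labels[pkgLevel]); any finer labels are dropped in this
// single-proc case.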
1666 Address addr(1); 1667 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1668 retval[0].first = addr; 1669 1670 if (__kmp_affinity_gran_levels < 0) { 1671 __kmp_affinity_gran_levels = 0; 1672 } 1673 1674 if (__kmp_affinity_verbose) { 1675 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1676 } 1677 1678 *address2os = retval; 1679 KMP_CPU_FREE(oldMask); 1680 return 1; 1681 } 1682 1683 // Sort the table by physical Id. 1684 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1685 1686 // Find the radix at each of the levels. 1687 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1688 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1689 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1690 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1691 for (level = 0; level < depth; level++) { 1692 totals[level] = 1; 1693 maxCt[level] = 1; 1694 counts[level] = 1; 1695 last[level] = retval[0].first.labels[level]; 1696 } 1697 1698 // From here on, the iteration variable "level" runs from the finest level to 1699 // the coarsest, i.e. we iterate forward through 1700 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1701 // backwards. 1702 for (proc = 1; (int)proc < nApics; proc++) { 1703 int level; 1704 for (level = 0; level < depth; level++) { 1705 if (retval[proc].first.labels[level] != last[level]) { 1706 int j; 1707 for (j = level + 1; j < depth; j++) { 1708 totals[j]++; 1709 counts[j] = 1; 1710 // The line below causes printing incorrect topology information in 1711 // case the max value for some level (maxCt[level]) is encountered 1712 // earlier than some less value while going through the array. For 1713 // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then 1714 // maxCt[1] == 2 1715 // whereas it must be 4. 1716 // TODO!!! Check if it can be commented safely 1717 // maxCt[j] = 1; 1718 last[j] = retval[proc].first.labels[j]; 1719 } 1720 totals[level]++; 1721 counts[level]++; 1722 if (counts[level] > maxCt[level]) { 1723 maxCt[level] = counts[level]; 1724 } 1725 last[level] = retval[proc].first.labels[level]; 1726 break; 1727 } else if (level == depth - 1) { 1728 __kmp_free(last); 1729 __kmp_free(maxCt); 1730 __kmp_free(counts); 1731 __kmp_free(totals); 1732 __kmp_free(retval); 1733 KMP_CPU_FREE(oldMask); 1734 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1735 return -1; 1736 } 1737 } 1738 } 1739 1740 // When affinity is off, this routine will still be called to set 1741 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1742 // Make sure all these vars are set correctly, and return if affinity is not 1743 // enabled. 1744 if (threadLevel >= 0) { 1745 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1746 } else { 1747 __kmp_nThreadsPerCore = 1; 1748 } 1749 nPackages = totals[pkgLevel]; 1750 1751 if (coreLevel >= 0) { 1752 __kmp_ncores = totals[coreLevel]; 1753 nCoresPerPkg = maxCt[coreLevel]; 1754 } else { 1755 __kmp_ncores = nPackages; 1756 nCoresPerPkg = 1; 1757 } 1758 1759 // Check to see if the machine topology is uniform 1760 unsigned prod = maxCt[0]; 1761 for (level = 1; level < depth; level++) { 1762 prod *= maxCt[level]; 1763 } 1764 bool uniform = (prod == totals[level - 1]); 1765 1766 // Print the machine topology summary. 
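// Note on the 'uniform' flag computed above: it compares the product of the
// per-level maxima with the number of leaves actually seen. Hypothetical
// example: maxCt == {2, 4, 2} (packages, cores per package, threads per
// core) gives prod == 16, which equals totals[depth - 1] only if every
// package really has 4 cores and every core 2 threads; a package with fewer
// cores makes the topology non-uniform.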
1767 if (__kmp_affinity_verbose) { 1768 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1769 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1770 if (uniform) { 1771 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1772 } else { 1773 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1774 } 1775 1776 kmp_str_buf_t buf; 1777 __kmp_str_buf_init(&buf); 1778 1779 __kmp_str_buf_print(&buf, "%d", totals[0]); 1780 for (level = 1; level <= pkgLevel; level++) { 1781 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1782 } 1783 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1784 __kmp_nThreadsPerCore, __kmp_ncores); 1785 1786 __kmp_str_buf_free(&buf); 1787 } 1788 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1789 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1790 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1791 for (proc = 0; (int)proc < nApics; ++proc) { 1792 __kmp_pu_os_idx[proc] = retval[proc].second; 1793 } 1794 if (__kmp_affinity_type == affinity_none) { 1795 __kmp_free(last); 1796 __kmp_free(maxCt); 1797 __kmp_free(counts); 1798 __kmp_free(totals); 1799 __kmp_free(retval); 1800 KMP_CPU_FREE(oldMask); 1801 return 0; 1802 } 1803 1804 // Find any levels with radix 1, and remove them from the map 1805 // (except for the package level). 1806 int new_depth = 0; 1807 for (level = 0; level < depth; level++) { 1808 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1809 continue; 1810 } 1811 new_depth++; 1812 } 1813 1814 // If we are removing any levels, allocate a new vector to return, 1815 // and copy the relevant information to it. 1816 if (new_depth != depth) { 1817 AddrUnsPair *new_retval = 1818 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1819 for (proc = 0; (int)proc < nApics; proc++) { 1820 Address addr(new_depth); 1821 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1822 } 1823 int new_level = 0; 1824 int newPkgLevel = -1; 1825 int newCoreLevel = -1; 1826 int newThreadLevel = -1; 1827 for (level = 0; level < depth; level++) { 1828 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1829 // Remove this level. Never remove the package level 1830 continue; 1831 } 1832 if (level == pkgLevel) { 1833 newPkgLevel = new_level; 1834 } 1835 if (level == coreLevel) { 1836 newCoreLevel = new_level; 1837 } 1838 if (level == threadLevel) { 1839 newThreadLevel = new_level; 1840 } 1841 for (proc = 0; (int)proc < nApics; proc++) { 1842 new_retval[proc].first.labels[new_level] = 1843 retval[proc].first.labels[level]; 1844 } 1845 new_level++; 1846 } 1847 1848 __kmp_free(retval); 1849 retval = new_retval; 1850 depth = new_depth; 1851 pkgLevel = newPkgLevel; 1852 coreLevel = newCoreLevel; 1853 threadLevel = newThreadLevel; 1854 } 1855 1856 if (__kmp_affinity_gran_levels < 0) { 1857 // Set the granularity level based on what levels are modeled 1858 // in the machine topology map. 
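// __kmp_affinity_gran_levels counts how many of the finest levels fall
// below the requested granularity. For example (hypothetical setting): with
// all three levels present and KMP_AFFINITY granularity=core, only the
// thread level is finer than the request, so the result is 1; with
// granularity=package it would be 2.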
1859 __kmp_affinity_gran_levels = 0; 1860 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1861 __kmp_affinity_gran_levels++; 1862 } 1863 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1864 __kmp_affinity_gran_levels++; 1865 } 1866 if (__kmp_affinity_gran > affinity_gran_package) { 1867 __kmp_affinity_gran_levels++; 1868 } 1869 } 1870 1871 if (__kmp_affinity_verbose) { 1872 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, 1873 threadLevel); 1874 } 1875 1876 __kmp_free(last); 1877 __kmp_free(maxCt); 1878 __kmp_free(counts); 1879 __kmp_free(totals); 1880 KMP_CPU_FREE(oldMask); 1881 *address2os = retval; 1882 return depth; 1883 } 1884 1885 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1886 1887 #define osIdIndex 0 1888 #define threadIdIndex 1 1889 #define coreIdIndex 2 1890 #define pkgIdIndex 3 1891 #define nodeIdIndex 4 1892 1893 typedef unsigned *ProcCpuInfo; 1894 static unsigned maxIndex = pkgIdIndex; 1895 1896 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 1897 const void *b) { 1898 unsigned i; 1899 const unsigned *aa = *(unsigned *const *)a; 1900 const unsigned *bb = *(unsigned *const *)b; 1901 for (i = maxIndex;; i--) { 1902 if (aa[i] < bb[i]) 1903 return -1; 1904 if (aa[i] > bb[i]) 1905 return 1; 1906 if (i == osIdIndex) 1907 break; 1908 } 1909 return 0; 1910 } 1911 1912 #if KMP_USE_HIER_SCHED 1913 // Set the array sizes for the hierarchy layers 1914 static void __kmp_dispatch_set_hierarchy_values() { 1915 // Set the maximum number of L1's to number of cores 1916 // Set the maximum number of L2's to to either number of cores / 2 for 1917 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 1918 // Or the number of cores for Intel(R) Xeon(R) processors 1919 // Set the maximum number of NUMA nodes and L3's to number of packages 1920 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 1921 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 1922 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 1923 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 1924 KMP_MIC_SUPPORTED 1925 if (__kmp_mic_type >= mic3) 1926 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 1927 else 1928 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 1929 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 1930 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 1931 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 1932 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 1933 // Set the number of threads per unit 1934 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 1935 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 1936 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 1937 __kmp_nThreadsPerCore; 1938 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 1939 KMP_MIC_SUPPORTED 1940 if (__kmp_mic_type >= mic3) 1941 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 1942 2 * __kmp_nThreadsPerCore; 1943 else 1944 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 1945 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 1946 __kmp_nThreadsPerCore; 1947 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 1948 nCoresPerPkg * __kmp_nThreadsPerCore; 1949 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 1950 nCoresPerPkg * __kmp_nThreadsPerCore; 1951 
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 1952 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 1953 } 1954 1955 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 1956 // i.e., this thread's L1 or this thread's L2, etc. 1957 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 1958 int index = type + 1; 1959 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 1960 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 1961 if (type == kmp_hier_layer_e::LAYER_THREAD) 1962 return tid; 1963 else if (type == kmp_hier_layer_e::LAYER_LOOP) 1964 return 0; 1965 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 1966 if (tid >= num_hw_threads) 1967 tid = tid % num_hw_threads; 1968 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 1969 } 1970 1971 // Return the number of t1's per t2 1972 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 1973 int i1 = t1 + 1; 1974 int i2 = t2 + 1; 1975 KMP_DEBUG_ASSERT(i1 <= i2); 1976 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 1977 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 1978 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 1979 // (nthreads/t2) / (nthreads/t1) = t1 / t2 1980 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 1981 } 1982 #endif // KMP_USE_HIER_SCHED 1983 1984 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1985 // affinity map. 1986 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 1987 int *line, 1988 kmp_i18n_id_t *const msg_id, 1989 FILE *f) { 1990 *address2os = NULL; 1991 *msg_id = kmp_i18n_null; 1992 1993 // Scan of the file, and count the number of "processor" (osId) fields, 1994 // and find the highest value of <n> for a node_<n> field. 1995 char buf[256]; 1996 unsigned num_records = 0; 1997 while (!feof(f)) { 1998 buf[sizeof(buf) - 1] = 1; 1999 if (!fgets(buf, sizeof(buf), f)) { 2000 // Read errors presumably because of EOF 2001 break; 2002 } 2003 2004 char s1[] = "processor"; 2005 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2006 num_records++; 2007 continue; 2008 } 2009 2010 // FIXME - this will match "node_<n> <garbage>" 2011 unsigned level; 2012 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2013 if (nodeIdIndex + level >= maxIndex) { 2014 maxIndex = nodeIdIndex + level; 2015 } 2016 continue; 2017 } 2018 } 2019 2020 // Check for empty file / no valid processor records, or too many. The number 2021 // of records can't exceed the number of valid bits in the affinity mask. 2022 if (num_records == 0) { 2023 *line = 0; 2024 *msg_id = kmp_i18n_str_NoProcRecords; 2025 return -1; 2026 } 2027 if (num_records > (unsigned)__kmp_xproc) { 2028 *line = 0; 2029 *msg_id = kmp_i18n_str_TooManyProcRecords; 2030 return -1; 2031 } 2032 2033 // Set the file pointer back to the beginning, so that we can scan the file 2034 // again, this time performing a full parse of the data. Allocate a vector of 2035 // ProcCpuInfo object, where we will place the data. Adding an extra element 2036 // at the end allows us to remove a lot of extra checks for termination 2037 // conditions. 2038 if (fseek(f, 0, SEEK_SET) != 0) { 2039 *line = 0; 2040 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2041 return -1; 2042 } 2043 2044 // Allocate the array of records to store the proc info in. The dummy 2045 // element at the end makes the logic in filling them out easier to code. 
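// Each threadInfo[i] is a small array of (maxIndex + 1) fields indexed by
// osIdIndex, threadIdIndex, coreIdIndex, pkgIdIndex and any node_<n>
// levels; a field left at UINT_MAX means the corresponding line was not
// present in that processor record.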
2046 unsigned **threadInfo = 2047 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2048 unsigned i; 2049 for (i = 0; i <= num_records; i++) { 2050 threadInfo[i] = 2051 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2052 } 2053 2054 #define CLEANUP_THREAD_INFO \ 2055 for (i = 0; i <= num_records; i++) { \ 2056 __kmp_free(threadInfo[i]); \ 2057 } \ 2058 __kmp_free(threadInfo); 2059 2060 // A value of UINT_MAX means that we didn't find the field 2061 unsigned __index; 2062 2063 #define INIT_PROC_INFO(p) \ 2064 for (__index = 0; __index <= maxIndex; __index++) { \ 2065 (p)[__index] = UINT_MAX; \ 2066 } 2067 2068 for (i = 0; i <= num_records; i++) { 2069 INIT_PROC_INFO(threadInfo[i]); 2070 } 2071 2072 unsigned num_avail = 0; 2073 *line = 0; 2074 while (!feof(f)) { 2075 // Create an inner scoping level, so that all the goto targets at the end of 2076 // the loop appear in an outer scoping level. This avoids warnings about 2077 // jumping past an initialization to a target in the same block. 2078 { 2079 buf[sizeof(buf) - 1] = 1; 2080 bool long_line = false; 2081 if (!fgets(buf, sizeof(buf), f)) { 2082 // Read errors presumably because of EOF 2083 // If there is valid data in threadInfo[num_avail], then fake 2084 // a blank line in ensure that the last address gets parsed. 2085 bool valid = false; 2086 for (i = 0; i <= maxIndex; i++) { 2087 if (threadInfo[num_avail][i] != UINT_MAX) { 2088 valid = true; 2089 } 2090 } 2091 if (!valid) { 2092 break; 2093 } 2094 buf[0] = 0; 2095 } else if (!buf[sizeof(buf) - 1]) { 2096 // The line is longer than the buffer. Set a flag and don't 2097 // emit an error if we were going to ignore the line, anyway. 2098 long_line = true; 2099 2100 #define CHECK_LINE \ 2101 if (long_line) { \ 2102 CLEANUP_THREAD_INFO; \ 2103 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2104 return -1; \ 2105 } 2106 } 2107 (*line)++; 2108 2109 char s1[] = "processor"; 2110 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2111 CHECK_LINE; 2112 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2113 unsigned val; 2114 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2115 goto no_val; 2116 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2117 #if KMP_ARCH_AARCH64 2118 // Handle the old AArch64 /proc/cpuinfo layout differently, 2119 // it contains all of the 'processor' entries listed in a 2120 // single 'Processor' section, therefore the normal looking 2121 // for duplicates in that section will always fail. 
2122 num_avail++; 2123 #else 2124 goto dup_field; 2125 #endif 2126 threadInfo[num_avail][osIdIndex] = val; 2127 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2128 char path[256]; 2129 KMP_SNPRINTF( 2130 path, sizeof(path), 2131 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2132 threadInfo[num_avail][osIdIndex]); 2133 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2134 2135 KMP_SNPRINTF(path, sizeof(path), 2136 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2137 threadInfo[num_avail][osIdIndex]); 2138 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2139 continue; 2140 #else 2141 } 2142 char s2[] = "physical id"; 2143 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2144 CHECK_LINE; 2145 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2146 unsigned val; 2147 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2148 goto no_val; 2149 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2150 goto dup_field; 2151 threadInfo[num_avail][pkgIdIndex] = val; 2152 continue; 2153 } 2154 char s3[] = "core id"; 2155 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2156 CHECK_LINE; 2157 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2158 unsigned val; 2159 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2160 goto no_val; 2161 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2162 goto dup_field; 2163 threadInfo[num_avail][coreIdIndex] = val; 2164 continue; 2165 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2166 } 2167 char s4[] = "thread id"; 2168 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2169 CHECK_LINE; 2170 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2171 unsigned val; 2172 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2173 goto no_val; 2174 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2175 goto dup_field; 2176 threadInfo[num_avail][threadIdIndex] = val; 2177 continue; 2178 } 2179 unsigned level; 2180 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2181 CHECK_LINE; 2182 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2183 unsigned val; 2184 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2185 goto no_val; 2186 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2187 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2188 goto dup_field; 2189 threadInfo[num_avail][nodeIdIndex + level] = val; 2190 continue; 2191 } 2192 2193 // We didn't recognize the leading token on the line. There are lots of 2194 // leading tokens that we don't recognize - if the line isn't empty, go on 2195 // to the next line. 2196 if ((*buf != 0) && (*buf != '\n')) { 2197 // If the line is longer than the buffer, read characters 2198 // until we find a newline. 2199 if (long_line) { 2200 int ch; 2201 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2202 ; 2203 } 2204 continue; 2205 } 2206 2207 // A newline has signalled the end of the processor record. 2208 // Check that there aren't too many procs specified. 2209 if ((int)num_avail == __kmp_xproc) { 2210 CLEANUP_THREAD_INFO; 2211 *msg_id = kmp_i18n_str_TooManyEntries; 2212 return -1; 2213 } 2214 2215 // Check for missing fields. The osId field must be there, and we 2216 // currently require that the physical id field is specified, also. 
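// Note that the physical id check below inspects threadInfo[0] (the first
// record parsed) rather than the record just completed, so it fires as soon
// as the first record of a file that never supplies "physical id" ends.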
2217 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2218 CLEANUP_THREAD_INFO; 2219 *msg_id = kmp_i18n_str_MissingProcField; 2220 return -1; 2221 } 2222 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2223 CLEANUP_THREAD_INFO; 2224 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2225 return -1; 2226 } 2227 2228 // Skip this proc if it is not included in the machine model. 2229 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2230 __kmp_affin_fullMask)) { 2231 INIT_PROC_INFO(threadInfo[num_avail]); 2232 continue; 2233 } 2234 2235 // We have a successful parse of this proc's info. 2236 // Increment the counter, and prepare for the next proc. 2237 num_avail++; 2238 KMP_ASSERT(num_avail <= num_records); 2239 INIT_PROC_INFO(threadInfo[num_avail]); 2240 } 2241 continue; 2242 2243 no_val: 2244 CLEANUP_THREAD_INFO; 2245 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2246 return -1; 2247 2248 dup_field: 2249 CLEANUP_THREAD_INFO; 2250 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2251 return -1; 2252 } 2253 *line = 0; 2254 2255 #if KMP_MIC && REDUCE_TEAM_SIZE 2256 unsigned teamSize = 0; 2257 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2258 2259 // check for num_records == __kmp_xproc ??? 2260 2261 // If there's only one thread context to bind to, form an Address object with 2262 // depth 1 and return immediately (or, if affinity is off, set address2os to 2263 // NULL and return). 2264 // 2265 // If it is configured to omit the package level when there is only a single 2266 // package, the logic at the end of this routine won't work if there is only a 2267 // single thread - it would try to form an Address object with depth 0. 2268 KMP_ASSERT(num_avail > 0); 2269 KMP_ASSERT(num_avail <= num_records); 2270 if (num_avail == 1) { 2271 __kmp_ncores = 1; 2272 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2273 if (__kmp_affinity_verbose) { 2274 if (!KMP_AFFINITY_CAPABLE()) { 2275 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2276 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2277 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2278 } else { 2279 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2280 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2281 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2282 } 2283 int index; 2284 kmp_str_buf_t buf; 2285 __kmp_str_buf_init(&buf); 2286 __kmp_str_buf_print(&buf, "1"); 2287 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2288 __kmp_str_buf_print(&buf, " x 1"); 2289 } 2290 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2291 __kmp_str_buf_free(&buf); 2292 } 2293 2294 if (__kmp_affinity_type == affinity_none) { 2295 CLEANUP_THREAD_INFO; 2296 return 0; 2297 } 2298 2299 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2300 Address addr(1); 2301 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2302 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2303 2304 if (__kmp_affinity_gran_levels < 0) { 2305 __kmp_affinity_gran_levels = 0; 2306 } 2307 2308 if (__kmp_affinity_verbose) { 2309 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2310 } 2311 2312 CLEANUP_THREAD_INFO; 2313 return 1; 2314 } 2315 2316 // Sort the threadInfo table by physical Id. 2317 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2318 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2319 2320 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2321 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2322 // the chips on a system. Although coreId's are usually assigned 2323 // [0 .. 
coresPerPkg-1] and threadId's are usually assigned 2324 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2325 // 2326 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2327 // total # packages) are at this point - we want to determine that now. We 2328 // only have an upper bound on the first two figures. 2329 unsigned *counts = 2330 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2331 unsigned *maxCt = 2332 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2333 unsigned *totals = 2334 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2335 unsigned *lastId = 2336 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2337 2338 bool assign_thread_ids = false; 2339 unsigned threadIdCt; 2340 unsigned index; 2341 2342 restart_radix_check: 2343 threadIdCt = 0; 2344 2345 // Initialize the counter arrays with data from threadInfo[0]. 2346 if (assign_thread_ids) { 2347 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2348 threadInfo[0][threadIdIndex] = threadIdCt++; 2349 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2350 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2351 } 2352 } 2353 for (index = 0; index <= maxIndex; index++) { 2354 counts[index] = 1; 2355 maxCt[index] = 1; 2356 totals[index] = 1; 2357 lastId[index] = threadInfo[0][index]; 2358 ; 2359 } 2360 2361 // Run through the rest of the OS procs. 2362 for (i = 1; i < num_avail; i++) { 2363 // Find the most significant index whose id differs from the id for the 2364 // previous OS proc. 2365 for (index = maxIndex; index >= threadIdIndex; index--) { 2366 if (assign_thread_ids && (index == threadIdIndex)) { 2367 // Auto-assign the thread id field if it wasn't specified. 2368 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2369 threadInfo[i][threadIdIndex] = threadIdCt++; 2370 } 2371 // Apparently the thread id field was specified for some entries and not 2372 // others. Start the thread id counter off at the next higher thread id. 2373 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2374 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2375 } 2376 } 2377 if (threadInfo[i][index] != lastId[index]) { 2378 // Run through all indices which are less significant, and reset the 2379 // counts to 1. At all levels up to and including index, we need to 2380 // increment the totals and record the last id. 2381 unsigned index2; 2382 for (index2 = threadIdIndex; index2 < index; index2++) { 2383 totals[index2]++; 2384 if (counts[index2] > maxCt[index2]) { 2385 maxCt[index2] = counts[index2]; 2386 } 2387 counts[index2] = 1; 2388 lastId[index2] = threadInfo[i][index2]; 2389 } 2390 counts[index]++; 2391 totals[index]++; 2392 lastId[index] = threadInfo[i][index]; 2393 2394 if (assign_thread_ids && (index > threadIdIndex)) { 2395 2396 #if KMP_MIC && REDUCE_TEAM_SIZE 2397 // The default team size is the total #threads in the machine 2398 // minus 1 thread for every core that has 3 or more threads. 2399 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2400 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2401 2402 // Restart the thread counter, as we are on a new core. 2403 threadIdCt = 0; 2404 2405 // Auto-assign the thread id field if it wasn't specified. 2406 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2407 threadInfo[i][threadIdIndex] = threadIdCt++; 2408 } 2409 2410 // Apparently the thread id field was specified for some entries and 2411 // not others. Start the thread id counter off at the next higher 2412 // thread id. 
2413 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2414 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2415 }
2416 }
2417 break;
2418 }
2419 }
2420 if (index < threadIdIndex) {
2421 // If thread ids were specified, it is an error if they are not unique.
2422 // Also, check that we haven't already restarted the loop (to be safe -
2423 // shouldn't need to).
2424 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2425 __kmp_free(lastId);
2426 __kmp_free(totals);
2427 __kmp_free(maxCt);
2428 __kmp_free(counts);
2429 CLEANUP_THREAD_INFO;
2430 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2431 return -1;
2432 }
2433
2434 // If the thread ids were not specified and we see entries that
2435 // are duplicates, start the loop over and assign the thread ids manually.
2436 assign_thread_ids = true;
2437 goto restart_radix_check;
2438 }
2439 }
2440
2441 #if KMP_MIC && REDUCE_TEAM_SIZE
2442 // The default team size is the total #threads in the machine
2443 // minus 1 thread for every core that has 3 or more threads.
2444 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2445 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2446
2447 for (index = threadIdIndex; index <= maxIndex; index++) {
2448 if (counts[index] > maxCt[index]) {
2449 maxCt[index] = counts[index];
2450 }
2451 }
2452
2453 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2454 nCoresPerPkg = maxCt[coreIdIndex];
2455 nPackages = totals[pkgIdIndex];
2456
2457 // Check to see if the machine topology is uniform
2458 unsigned prod = totals[maxIndex];
2459 for (index = threadIdIndex; index < maxIndex; index++) {
2460 prod *= maxCt[index];
2461 }
2462 bool uniform = (prod == totals[threadIdIndex]);
2463
2464 // When affinity is off, this routine will still be called to set
2465 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2466 // Make sure all these vars are set correctly, and return now if affinity is
2467 // not enabled.
2468 __kmp_ncores = totals[coreIdIndex];
2469
2470 if (__kmp_affinity_verbose) {
2471 if (!KMP_AFFINITY_CAPABLE()) {
2472 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2473 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2474 if (uniform) {
2475 KMP_INFORM(Uniform, "KMP_AFFINITY");
2476 } else {
2477 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2478 }
2479 } else {
2480 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2481 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2482 if (uniform) {
2483 KMP_INFORM(Uniform, "KMP_AFFINITY");
2484 } else {
2485 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2486 }
2487 }
2488 kmp_str_buf_t buf;
2489 __kmp_str_buf_init(&buf);
2490
2491 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2492 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2493 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2494 }
2495 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2496 maxCt[threadIdIndex], __kmp_ncores);
2497
2498 __kmp_str_buf_free(&buf);
2499 }
2500
2501 #if KMP_MIC && REDUCE_TEAM_SIZE
2502 // Set the default team size.
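// teamSize was accumulated above as the full thread count for cores with
// one or two threads, and (threads - 1) for cores with three or more; e.g.
// (hypothetical) 60 cores x 4 threads gives a default team of
// 60 * 3 == 180 rather than 240.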
2503 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2504 __kmp_dflt_team_nth = teamSize; 2505 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2506 "__kmp_dflt_team_nth = %d\n", 2507 __kmp_dflt_team_nth)); 2508 } 2509 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2510 2511 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2512 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2513 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2514 for (i = 0; i < num_avail; ++i) { // fill the os indices 2515 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2516 } 2517 2518 if (__kmp_affinity_type == affinity_none) { 2519 __kmp_free(lastId); 2520 __kmp_free(totals); 2521 __kmp_free(maxCt); 2522 __kmp_free(counts); 2523 CLEANUP_THREAD_INFO; 2524 return 0; 2525 } 2526 2527 // Count the number of levels which have more nodes at that level than at the 2528 // parent's level (with there being an implicit root node of the top level). 2529 // This is equivalent to saying that there is at least one node at this level 2530 // which has a sibling. These levels are in the map, and the package level is 2531 // always in the map. 2532 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2533 for (index = threadIdIndex; index < maxIndex; index++) { 2534 KMP_ASSERT(totals[index] >= totals[index + 1]); 2535 inMap[index] = (totals[index] > totals[index + 1]); 2536 } 2537 inMap[maxIndex] = (totals[maxIndex] > 1); 2538 inMap[pkgIdIndex] = true; 2539 2540 int depth = 0; 2541 for (index = threadIdIndex; index <= maxIndex; index++) { 2542 if (inMap[index]) { 2543 depth++; 2544 } 2545 } 2546 KMP_ASSERT(depth > 0); 2547 2548 // Construct the data structure that is to be returned. 2549 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2550 int pkgLevel = -1; 2551 int coreLevel = -1; 2552 int threadLevel = -1; 2553 2554 for (i = 0; i < num_avail; ++i) { 2555 Address addr(depth); 2556 unsigned os = threadInfo[i][osIdIndex]; 2557 int src_index; 2558 int dst_index = 0; 2559 2560 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2561 if (!inMap[src_index]) { 2562 continue; 2563 } 2564 addr.labels[dst_index] = threadInfo[i][src_index]; 2565 if (src_index == pkgIdIndex) { 2566 pkgLevel = dst_index; 2567 } else if (src_index == coreIdIndex) { 2568 coreLevel = dst_index; 2569 } else if (src_index == threadIdIndex) { 2570 threadLevel = dst_index; 2571 } 2572 dst_index++; 2573 } 2574 (*address2os)[i] = AddrUnsPair(addr, os); 2575 } 2576 2577 if (__kmp_affinity_gran_levels < 0) { 2578 // Set the granularity level based on what levels are modeled 2579 // in the machine topology map. 
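// Only levels recorded in inMap[] contribute here; the resulting count
// decides how many of the finest levels __kmp_create_masks() later
// collapses when it ORs per-thread masks into coarser-grained masks.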
2580 unsigned src_index; 2581 __kmp_affinity_gran_levels = 0; 2582 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2583 if (!inMap[src_index]) { 2584 continue; 2585 } 2586 switch (src_index) { 2587 case threadIdIndex: 2588 if (__kmp_affinity_gran > affinity_gran_thread) { 2589 __kmp_affinity_gran_levels++; 2590 } 2591 2592 break; 2593 case coreIdIndex: 2594 if (__kmp_affinity_gran > affinity_gran_core) { 2595 __kmp_affinity_gran_levels++; 2596 } 2597 break; 2598 2599 case pkgIdIndex: 2600 if (__kmp_affinity_gran > affinity_gran_package) { 2601 __kmp_affinity_gran_levels++; 2602 } 2603 break; 2604 } 2605 } 2606 } 2607 2608 if (__kmp_affinity_verbose) { 2609 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2610 coreLevel, threadLevel); 2611 } 2612 2613 __kmp_free(inMap); 2614 __kmp_free(lastId); 2615 __kmp_free(totals); 2616 __kmp_free(maxCt); 2617 __kmp_free(counts); 2618 CLEANUP_THREAD_INFO; 2619 return depth; 2620 } 2621 2622 // Create and return a table of affinity masks, indexed by OS thread ID. 2623 // This routine handles OR'ing together all the affinity masks of threads 2624 // that are sufficiently close, if granularity > fine. 2625 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2626 unsigned *numUnique, 2627 AddrUnsPair *address2os, 2628 unsigned numAddrs) { 2629 // First form a table of affinity masks in order of OS thread id. 2630 unsigned depth; 2631 unsigned maxOsId; 2632 unsigned i; 2633 2634 KMP_ASSERT(numAddrs > 0); 2635 depth = address2os[0].first.depth; 2636 2637 maxOsId = 0; 2638 for (i = numAddrs - 1;; --i) { 2639 unsigned osId = address2os[i].second; 2640 if (osId > maxOsId) { 2641 maxOsId = osId; 2642 } 2643 if (i == 0) 2644 break; 2645 } 2646 kmp_affin_mask_t *osId2Mask; 2647 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2648 2649 // Sort the address2os table according to physical order. Doing so will put 2650 // all threads on the same core/package/node in consecutive locations. 2651 qsort(address2os, numAddrs, sizeof(*address2os), 2652 __kmp_affinity_cmp_Address_labels); 2653 2654 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2655 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2656 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2657 } 2658 if (__kmp_affinity_gran_levels >= (int)depth) { 2659 if (__kmp_affinity_verbose || 2660 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2661 KMP_WARNING(AffThreadsMayMigrate); 2662 } 2663 } 2664 2665 // Run through the table, forming the masks for all threads on each core. 2666 // Threads on the same core will have identical "Address" objects, not 2667 // considering the last level, which must be the thread id. All threads on a 2668 // core will appear consecutively. 2669 unsigned unique = 0; 2670 unsigned j = 0; // index of 1st thread on core 2671 unsigned leader = 0; 2672 Address *leaderAddr = &(address2os[0].first); 2673 kmp_affin_mask_t *sum; 2674 KMP_CPU_ALLOC_ON_STACK(sum); 2675 KMP_CPU_ZERO(sum); 2676 KMP_CPU_SET(address2os[0].second, sum); 2677 for (i = 1; i < numAddrs; i++) { 2678 // If this thread is sufficiently close to the leader (within the 2679 // granularity setting), then set the bit for this os thread in the 2680 // affinity mask for this group, and go on to the next thread. 
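// "Sufficiently close" means the two addresses agree on all labels above
// the last __kmp_affinity_gran_levels levels; e.g. with one granularity
// level on a package/core/thread map, all hw threads of a core end up
// sharing one mask.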
2681 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2682 KMP_CPU_SET(address2os[i].second, sum); 2683 continue; 2684 } 2685 2686 // For every thread in this group, copy the mask to the thread's entry in 2687 // the osId2Mask table. Mark the first address as a leader. 2688 for (; j < i; j++) { 2689 unsigned osId = address2os[j].second; 2690 KMP_DEBUG_ASSERT(osId <= maxOsId); 2691 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2692 KMP_CPU_COPY(mask, sum); 2693 address2os[j].first.leader = (j == leader); 2694 } 2695 unique++; 2696 2697 // Start a new mask. 2698 leader = i; 2699 leaderAddr = &(address2os[i].first); 2700 KMP_CPU_ZERO(sum); 2701 KMP_CPU_SET(address2os[i].second, sum); 2702 } 2703 2704 // For every thread in last group, copy the mask to the thread's 2705 // entry in the osId2Mask table. 2706 for (; j < i; j++) { 2707 unsigned osId = address2os[j].second; 2708 KMP_DEBUG_ASSERT(osId <= maxOsId); 2709 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2710 KMP_CPU_COPY(mask, sum); 2711 address2os[j].first.leader = (j == leader); 2712 } 2713 unique++; 2714 KMP_CPU_FREE_FROM_STACK(sum); 2715 2716 *maxIndex = maxOsId; 2717 *numUnique = unique; 2718 return osId2Mask; 2719 } 2720 2721 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2722 // as file-static than to try and pass them through the calling sequence of 2723 // the recursive-descent OMP_PLACES parser. 2724 static kmp_affin_mask_t *newMasks; 2725 static int numNewMasks; 2726 static int nextNewMask; 2727 2728 #define ADD_MASK(_mask) \ 2729 { \ 2730 if (nextNewMask >= numNewMasks) { \ 2731 int i; \ 2732 numNewMasks *= 2; \ 2733 kmp_affin_mask_t *temp; \ 2734 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2735 for (i = 0; i < numNewMasks / 2; i++) { \ 2736 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2737 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2738 KMP_CPU_COPY(dest, src); \ 2739 } \ 2740 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2741 newMasks = temp; \ 2742 } \ 2743 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2744 nextNewMask++; \ 2745 } 2746 2747 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2748 { \ 2749 if (((_osId) > _maxOsId) || \ 2750 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2751 if (__kmp_affinity_verbose || \ 2752 (__kmp_affinity_warnings && \ 2753 (__kmp_affinity_type != affinity_none))) { \ 2754 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2755 } \ 2756 } else { \ 2757 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2758 } \ 2759 } 2760 2761 // Re-parse the proclist (for the explicit affinity type), and form the list 2762 // of affinity newMasks indexed by gtid. 2763 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2764 unsigned int *out_numMasks, 2765 const char *proclist, 2766 kmp_affin_mask_t *osId2Mask, 2767 int maxOsId) { 2768 int i; 2769 const char *scan = proclist; 2770 const char *next = proclist; 2771 2772 // We use malloc() for the temporary mask vector, so that we can use 2773 // realloc() to extend it. 
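// newMasks starts small (room for 2 masks); the ADD_MASK macro above
// doubles numNewMasks and copies the existing entries whenever the next
// mask would not fit.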
2774 numNewMasks = 2; 2775 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2776 nextNewMask = 0; 2777 kmp_affin_mask_t *sumMask; 2778 KMP_CPU_ALLOC(sumMask); 2779 int setSize = 0; 2780 2781 for (;;) { 2782 int start, end, stride; 2783 2784 SKIP_WS(scan); 2785 next = scan; 2786 if (*next == '\0') { 2787 break; 2788 } 2789 2790 if (*next == '{') { 2791 int num; 2792 setSize = 0; 2793 next++; // skip '{' 2794 SKIP_WS(next); 2795 scan = next; 2796 2797 // Read the first integer in the set. 2798 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2799 SKIP_DIGITS(next); 2800 num = __kmp_str_to_int(scan, *next); 2801 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2802 2803 // Copy the mask for that osId to the sum (union) mask. 2804 if ((num > maxOsId) || 2805 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2806 if (__kmp_affinity_verbose || 2807 (__kmp_affinity_warnings && 2808 (__kmp_affinity_type != affinity_none))) { 2809 KMP_WARNING(AffIgnoreInvalidProcID, num); 2810 } 2811 KMP_CPU_ZERO(sumMask); 2812 } else { 2813 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2814 setSize = 1; 2815 } 2816 2817 for (;;) { 2818 // Check for end of set. 2819 SKIP_WS(next); 2820 if (*next == '}') { 2821 next++; // skip '}' 2822 break; 2823 } 2824 2825 // Skip optional comma. 2826 if (*next == ',') { 2827 next++; 2828 } 2829 SKIP_WS(next); 2830 2831 // Read the next integer in the set. 2832 scan = next; 2833 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2834 2835 SKIP_DIGITS(next); 2836 num = __kmp_str_to_int(scan, *next); 2837 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2838 2839 // Add the mask for that osId to the sum mask. 2840 if ((num > maxOsId) || 2841 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2842 if (__kmp_affinity_verbose || 2843 (__kmp_affinity_warnings && 2844 (__kmp_affinity_type != affinity_none))) { 2845 KMP_WARNING(AffIgnoreInvalidProcID, num); 2846 } 2847 } else { 2848 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2849 setSize++; 2850 } 2851 } 2852 if (setSize > 0) { 2853 ADD_MASK(sumMask); 2854 } 2855 2856 SKIP_WS(next); 2857 if (*next == ',') { 2858 next++; 2859 } 2860 scan = next; 2861 continue; 2862 } 2863 2864 // Read the first integer. 2865 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2866 SKIP_DIGITS(next); 2867 start = __kmp_str_to_int(scan, *next); 2868 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2869 SKIP_WS(next); 2870 2871 // If this isn't a range, then add a mask to the list and go on. 2872 if (*next != '-') { 2873 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2874 2875 // Skip optional comma. 2876 if (*next == ',') { 2877 next++; 2878 } 2879 scan = next; 2880 continue; 2881 } 2882 2883 // This is a range. Skip over the '-' and read in the 2nd int. 2884 next++; // skip '-' 2885 SKIP_WS(next); 2886 scan = next; 2887 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2888 SKIP_DIGITS(next); 2889 end = __kmp_str_to_int(scan, *next); 2890 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2891 2892 // Check for a stride parameter 2893 stride = 1; 2894 SKIP_WS(next); 2895 if (*next == ':') { 2896 // A stride is specified. Skip over the ':" and read the 3rd int. 
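// Illustrative proclist fragments: "3-11:4" adds separate masks for OS
// procs 3, 7 and 11, while "11-3:-4" walks downward and adds 11, 7 and 3.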
2897 int sign = +1; 2898 next++; // skip ':' 2899 SKIP_WS(next); 2900 scan = next; 2901 if (*next == '-') { 2902 sign = -1; 2903 next++; 2904 SKIP_WS(next); 2905 scan = next; 2906 } 2907 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2908 SKIP_DIGITS(next); 2909 stride = __kmp_str_to_int(scan, *next); 2910 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2911 stride *= sign; 2912 } 2913 2914 // Do some range checks. 2915 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2916 if (stride > 0) { 2917 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2918 } else { 2919 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2920 } 2921 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2922 2923 // Add the mask for each OS proc # to the list. 2924 if (stride > 0) { 2925 do { 2926 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2927 start += stride; 2928 } while (start <= end); 2929 } else { 2930 do { 2931 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2932 start += stride; 2933 } while (start >= end); 2934 } 2935 2936 // Skip optional comma. 2937 SKIP_WS(next); 2938 if (*next == ',') { 2939 next++; 2940 } 2941 scan = next; 2942 } 2943 2944 *out_numMasks = nextNewMask; 2945 if (nextNewMask == 0) { 2946 *out_masks = NULL; 2947 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2948 return; 2949 } 2950 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2951 for (i = 0; i < nextNewMask; i++) { 2952 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2953 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2954 KMP_CPU_COPY(dest, src); 2955 } 2956 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2957 KMP_CPU_FREE(sumMask); 2958 } 2959 2960 /*----------------------------------------------------------------------------- 2961 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2962 places. Again, Here is the grammar: 2963 2964 place_list := place 2965 place_list := place , place_list 2966 place := num 2967 place := place : num 2968 place := place : num : signed 2969 place := { subplacelist } 2970 place := ! 
place // (lowest priority) 2971 subplace_list := subplace 2972 subplace_list := subplace , subplace_list 2973 subplace := num 2974 subplace := num : num 2975 subplace := num : num : signed 2976 signed := num 2977 signed := + signed 2978 signed := - signed 2979 -----------------------------------------------------------------------------*/ 2980 static void __kmp_process_subplace_list(const char **scan, 2981 kmp_affin_mask_t *osId2Mask, 2982 int maxOsId, kmp_affin_mask_t *tempMask, 2983 int *setSize) { 2984 const char *next; 2985 2986 for (;;) { 2987 int start, count, stride, i; 2988 2989 // Read in the starting proc id 2990 SKIP_WS(*scan); 2991 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2992 next = *scan; 2993 SKIP_DIGITS(next); 2994 start = __kmp_str_to_int(*scan, *next); 2995 KMP_ASSERT(start >= 0); 2996 *scan = next; 2997 2998 // valid follow sets are ',' ':' and '}' 2999 SKIP_WS(*scan); 3000 if (**scan == '}' || **scan == ',') { 3001 if ((start > maxOsId) || 3002 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3003 if (__kmp_affinity_verbose || 3004 (__kmp_affinity_warnings && 3005 (__kmp_affinity_type != affinity_none))) { 3006 KMP_WARNING(AffIgnoreInvalidProcID, start); 3007 } 3008 } else { 3009 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3010 (*setSize)++; 3011 } 3012 if (**scan == '}') { 3013 break; 3014 } 3015 (*scan)++; // skip ',' 3016 continue; 3017 } 3018 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3019 (*scan)++; // skip ':' 3020 3021 // Read count parameter 3022 SKIP_WS(*scan); 3023 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3024 next = *scan; 3025 SKIP_DIGITS(next); 3026 count = __kmp_str_to_int(*scan, *next); 3027 KMP_ASSERT(count >= 0); 3028 *scan = next; 3029 3030 // valid follow sets are ',' ':' and '}' 3031 SKIP_WS(*scan); 3032 if (**scan == '}' || **scan == ',') { 3033 for (i = 0; i < count; i++) { 3034 if ((start > maxOsId) || 3035 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3036 if (__kmp_affinity_verbose || 3037 (__kmp_affinity_warnings && 3038 (__kmp_affinity_type != affinity_none))) { 3039 KMP_WARNING(AffIgnoreInvalidProcID, start); 3040 } 3041 break; // don't proliferate warnings for large count 3042 } else { 3043 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3044 start++; 3045 (*setSize)++; 3046 } 3047 } 3048 if (**scan == '}') { 3049 break; 3050 } 3051 (*scan)++; // skip ',' 3052 continue; 3053 } 3054 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3055 (*scan)++; // skip ':' 3056 3057 // Read stride parameter 3058 int sign = +1; 3059 for (;;) { 3060 SKIP_WS(*scan); 3061 if (**scan == '+') { 3062 (*scan)++; // skip '+' 3063 continue; 3064 } 3065 if (**scan == '-') { 3066 sign *= -1; 3067 (*scan)++; // skip '-' 3068 continue; 3069 } 3070 break; 3071 } 3072 SKIP_WS(*scan); 3073 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3074 next = *scan; 3075 SKIP_DIGITS(next); 3076 stride = __kmp_str_to_int(*scan, *next); 3077 KMP_ASSERT(stride >= 0); 3078 *scan = next; 3079 stride *= sign; 3080 3081 // valid follow sets are ',' and '}' 3082 SKIP_WS(*scan); 3083 if (**scan == '}' || **scan == ',') { 3084 for (i = 0; i < count; i++) { 3085 if ((start > maxOsId) || 3086 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3087 if (__kmp_affinity_verbose || 3088 (__kmp_affinity_warnings && 3089 (__kmp_affinity_type != affinity_none))) { 3090 KMP_WARNING(AffIgnoreInvalidProcID, start); 3091 } 3092 
break; // don't proliferate warnings for large count 3093 } else { 3094 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3095 start += stride; 3096 (*setSize)++; 3097 } 3098 } 3099 if (**scan == '}') { 3100 break; 3101 } 3102 (*scan)++; // skip ',' 3103 continue; 3104 } 3105 3106 KMP_ASSERT2(0, "bad explicit places list"); 3107 } 3108 } 3109 3110 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3111 int maxOsId, kmp_affin_mask_t *tempMask, 3112 int *setSize) { 3113 const char *next; 3114 3115 // valid follow sets are '{' '!' and num 3116 SKIP_WS(*scan); 3117 if (**scan == '{') { 3118 (*scan)++; // skip '{' 3119 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3120 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3121 (*scan)++; // skip '}' 3122 } else if (**scan == '!') { 3123 (*scan)++; // skip '!' 3124 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3125 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3126 } else if ((**scan >= '0') && (**scan <= '9')) { 3127 next = *scan; 3128 SKIP_DIGITS(next); 3129 int num = __kmp_str_to_int(*scan, *next); 3130 KMP_ASSERT(num >= 0); 3131 if ((num > maxOsId) || 3132 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3133 if (__kmp_affinity_verbose || 3134 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3135 KMP_WARNING(AffIgnoreInvalidProcID, num); 3136 } 3137 } else { 3138 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3139 (*setSize)++; 3140 } 3141 *scan = next; // skip num 3142 } else { 3143 KMP_ASSERT2(0, "bad explicit places list"); 3144 } 3145 } 3146 3147 // static void 3148 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3149 unsigned int *out_numMasks, 3150 const char *placelist, 3151 kmp_affin_mask_t *osId2Mask, 3152 int maxOsId) { 3153 int i, j, count, stride, sign; 3154 const char *scan = placelist; 3155 const char *next = placelist; 3156 3157 numNewMasks = 2; 3158 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3159 nextNewMask = 0; 3160 3161 // tempMask is modified based on the previous or initial 3162 // place to form the current place 3163 // previousMask contains the previous place 3164 kmp_affin_mask_t *tempMask; 3165 kmp_affin_mask_t *previousMask; 3166 KMP_CPU_ALLOC(tempMask); 3167 KMP_CPU_ZERO(tempMask); 3168 KMP_CPU_ALLOC(previousMask); 3169 KMP_CPU_ZERO(previousMask); 3170 int setSize = 0; 3171 3172 for (;;) { 3173 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3174 3175 // valid follow sets are ',' ':' and EOL 3176 SKIP_WS(scan); 3177 if (*scan == '\0' || *scan == ',') { 3178 if (setSize > 0) { 3179 ADD_MASK(tempMask); 3180 } 3181 KMP_CPU_ZERO(tempMask); 3182 setSize = 0; 3183 if (*scan == '\0') { 3184 break; 3185 } 3186 scan++; // skip ',' 3187 continue; 3188 } 3189 3190 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3191 scan++; // skip ':' 3192 3193 // Read count parameter 3194 SKIP_WS(scan); 3195 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3196 next = scan; 3197 SKIP_DIGITS(next); 3198 count = __kmp_str_to_int(scan, *next); 3199 KMP_ASSERT(count >= 0); 3200 scan = next; 3201 3202 // valid follow sets are ',' ':' and EOL 3203 SKIP_WS(scan); 3204 if (*scan == '\0' || *scan == ',') { 3205 stride = +1; 3206 } else { 3207 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3208 scan++; // skip ':' 3209 3210 // Read stride parameter 3211 sign = +1; 3212 for (;;) { 3213 SKIP_WS(scan); 3214 if (*scan == '+') { 3215 scan++; // skip 
'+' 3216 continue; 3217 } 3218 if (*scan == '-') { 3219 sign *= -1; 3220 scan++; // skip '-' 3221 continue; 3222 } 3223 break; 3224 } 3225 SKIP_WS(scan); 3226 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3227 next = scan; 3228 SKIP_DIGITS(next); 3229 stride = __kmp_str_to_int(scan, *next); 3230 KMP_DEBUG_ASSERT(stride >= 0); 3231 scan = next; 3232 stride *= sign; 3233 } 3234 3235 // Add places determined by initial_place : count : stride 3236 for (i = 0; i < count; i++) { 3237 if (setSize == 0) { 3238 break; 3239 } 3240 // Add the current place, then build the next place (tempMask) from that 3241 KMP_CPU_COPY(previousMask, tempMask); 3242 ADD_MASK(previousMask); 3243 KMP_CPU_ZERO(tempMask); 3244 setSize = 0; 3245 KMP_CPU_SET_ITERATE(j, previousMask) { 3246 if (!KMP_CPU_ISSET(j, previousMask)) { 3247 continue; 3248 } 3249 if ((j + stride > maxOsId) || (j + stride < 0) || 3250 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3251 (!KMP_CPU_ISSET(j + stride, 3252 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3253 if ((__kmp_affinity_verbose || 3254 (__kmp_affinity_warnings && 3255 (__kmp_affinity_type != affinity_none))) && 3256 i < count - 1) { 3257 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3258 } 3259 continue; 3260 } 3261 KMP_CPU_SET(j + stride, tempMask); 3262 setSize++; 3263 } 3264 } 3265 KMP_CPU_ZERO(tempMask); 3266 setSize = 0; 3267 3268 // valid follow sets are ',' and EOL 3269 SKIP_WS(scan); 3270 if (*scan == '\0') { 3271 break; 3272 } 3273 if (*scan == ',') { 3274 scan++; // skip ',' 3275 continue; 3276 } 3277 3278 KMP_ASSERT2(0, "bad explicit places list"); 3279 } 3280 3281 *out_numMasks = nextNewMask; 3282 if (nextNewMask == 0) { 3283 *out_masks = NULL; 3284 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3285 return; 3286 } 3287 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3288 KMP_CPU_FREE(tempMask); 3289 KMP_CPU_FREE(previousMask); 3290 for (i = 0; i < nextNewMask; i++) { 3291 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3292 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3293 KMP_CPU_COPY(dest, src); 3294 } 3295 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3296 } 3297 3298 #undef ADD_MASK 3299 #undef ADD_MASK_OSID 3300 3301 #if KMP_USE_HWLOC 3302 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3303 // skip PUs descendants of the object o 3304 int skipped = 0; 3305 hwloc_obj_t hT = NULL; 3306 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3307 for (int i = 0; i < N; ++i) { 3308 KMP_DEBUG_ASSERT(hT); 3309 unsigned idx = hT->os_index; 3310 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3311 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3312 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3313 ++skipped; 3314 } 3315 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3316 } 3317 return skipped; // count number of skipped units 3318 } 3319 3320 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3321 // check if obj has PUs present in fullMask 3322 hwloc_obj_t hT = NULL; 3323 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3324 for (int i = 0; i < N; ++i) { 3325 KMP_DEBUG_ASSERT(hT); 3326 unsigned idx = hT->os_index; 3327 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) 3328 return 1; // found PU 3329 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3330 } 3331 return 0; // no PUs found 3332 } 3333 #endif // KMP_USE_HWLOC 3334 3335 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3336 AddrUnsPair 
*newAddr;
3337 if (__kmp_hws_requested == 0)
3338 goto _exit; // no topology limiting actions requested, exit
3339 #if KMP_USE_HWLOC
3340 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3341 // Number of subobjects calculated dynamically, this works fine for
3342 // any non-uniform topology.
3343 // L2 cache objects are determined by depth, other objects - by type.
3344 hwloc_topology_t tp = __kmp_hwloc_topology;
3345 int nS = 0, nN = 0, nL = 0, nC = 0,
3346 nT = 0; // logical index including skipped
3347 int nCr = 0, nTr = 0; // number of requested units
3348 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3349 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3350 int L2depth, idx;
3351
3352 // check support of extensions ----------------------------------
3353 int numa_support = 0, tile_support = 0;
3354 if (__kmp_pu_os_idx)
3355 hT = hwloc_get_pu_obj_by_os_index(tp,
3356 __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3357 else
3358 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3359 if (hT == NULL) { // something's gone wrong
3360 KMP_WARNING(AffHWSubsetUnsupported);
3361 goto _exit;
3362 }
3363 // check NUMA node
3364 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3365 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3366 if (hN != NULL && hN->depth > hS->depth) {
3367 numa_support = 1; // 1 in case socket includes node(s)
3368 } else if (__kmp_hws_node.num > 0) {
3369 // don't support sockets inside NUMA node (no such HW found for testing)
3370 KMP_WARNING(AffHWSubsetUnsupported);
3371 goto _exit;
3372 }
3373 // check L2 cache, get object by depth because of multiple caches
3374 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3375 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3376 if (hL != NULL &&
3377 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
3378 tile_support = 1; // no point counting L2 if it includes a single core
3379 } else if (__kmp_hws_tile.num > 0) {
3380 if (__kmp_hws_core.num == 0) {
3381 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3382 __kmp_hws_tile.num = 0;
3383 } else {
3384 // L2 and core are both requested, but represent the same object
3385 KMP_WARNING(AffHWSubsetInvalid);
3386 goto _exit;
3387 }
3388 }
3389 // end of check of extensions -----------------------------------
3390
3391 // fill in unset items, validate settings -----------------------
3392 if (__kmp_hws_socket.num == 0)
3393 __kmp_hws_socket.num = nPackages; // use all available sockets
3394 if (__kmp_hws_socket.offset >= nPackages) {
3395 KMP_WARNING(AffHWSubsetManySockets);
3396 goto _exit;
3397 }
3398 if (numa_support) {
3399 hN = NULL;
3400 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3401 &hN); // num nodes in socket
3402 if (__kmp_hws_node.num == 0)
3403 __kmp_hws_node.num = NN; // use all available nodes
3404 if (__kmp_hws_node.offset >= NN) {
3405 KMP_WARNING(AffHWSubsetManyNodes);
3406 goto _exit;
3407 }
3408 if (tile_support) {
3409 // get num tiles in node
3410 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3411 if (__kmp_hws_tile.num == 0) {
3412 __kmp_hws_tile.num = NL + 1;
3413 } // use all available tiles, some node may have more tiles, thus +1
3414 if (__kmp_hws_tile.offset >= NL) {
3415 KMP_WARNING(AffHWSubsetManyTiles);
3416 goto _exit;
3417 }
3418 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3419 &hC); // num cores in tile
3420 if
(__kmp_hws_core.num == 0) 3421 __kmp_hws_core.num = NC; // use all available cores 3422 if (__kmp_hws_core.offset >= NC) { 3423 KMP_WARNING(AffHWSubsetManyCores); 3424 goto _exit; 3425 } 3426 } else { // tile_support 3427 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3428 &hC); // num cores in node 3429 if (__kmp_hws_core.num == 0) 3430 __kmp_hws_core.num = NC; // use all available cores 3431 if (__kmp_hws_core.offset >= NC) { 3432 KMP_WARNING(AffHWSubsetManyCores); 3433 goto _exit; 3434 } 3435 } // tile_support 3436 } else { // numa_support 3437 if (tile_support) { 3438 // get num tiles in socket 3439 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3440 if (__kmp_hws_tile.num == 0) 3441 __kmp_hws_tile.num = NL; // use all available tiles 3442 if (__kmp_hws_tile.offset >= NL) { 3443 KMP_WARNING(AffHWSubsetManyTiles); 3444 goto _exit; 3445 } 3446 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3447 &hC); // num cores in tile 3448 if (__kmp_hws_core.num == 0) 3449 __kmp_hws_core.num = NC; // use all available cores 3450 if (__kmp_hws_core.offset >= NC) { 3451 KMP_WARNING(AffHWSubsetManyCores); 3452 goto _exit; 3453 } 3454 } else { // tile_support 3455 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3456 &hC); // num cores in socket 3457 if (__kmp_hws_core.num == 0) 3458 __kmp_hws_core.num = NC; // use all available cores 3459 if (__kmp_hws_core.offset >= NC) { 3460 KMP_WARNING(AffHWSubsetManyCores); 3461 goto _exit; 3462 } 3463 } // tile_support 3464 } 3465 if (__kmp_hws_proc.num == 0) 3466 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3467 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3468 KMP_WARNING(AffHWSubsetManyProcs); 3469 goto _exit; 3470 } 3471 // end of validation -------------------------------------------- 3472 3473 if (pAddr) // pAddr is NULL in case of affinity_none 3474 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3475 __kmp_avail_proc); // max size 3476 // main loop to form HW subset ---------------------------------- 3477 hS = NULL; 3478 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3479 for (int s = 0; s < NP; ++s) { 3480 // Check Socket ----------------------------------------------- 3481 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3482 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3483 continue; // skip socket if all PUs are out of fullMask 3484 ++nS; // only count objects those have PUs in affinity mask 3485 if (nS <= __kmp_hws_socket.offset || 3486 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3487 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3488 continue; // move to next socket 3489 } 3490 nCr = 0; // count number of cores per socket 3491 // socket requested, go down the topology tree 3492 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3493 if (numa_support) { 3494 nN = 0; 3495 hN = NULL; 3496 // num nodes in current socket 3497 int NN = 3498 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3499 for (int n = 0; n < NN; ++n) { 3500 // Check NUMA Node ---------------------------------------- 3501 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3502 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3503 continue; // skip node if all PUs are out of fullMask 3504 } 3505 ++nN; 3506 if (nN <= __kmp_hws_node.offset || 3507 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3508 // skip node as not requested 3509 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // 
skip node 3510 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3511 continue; // move to next node 3512 } 3513 // node requested, go down the topology tree 3514 if (tile_support) { 3515 nL = 0; 3516 hL = NULL; 3517 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3518 for (int l = 0; l < NL; ++l) { 3519 // Check L2 (tile) ------------------------------------ 3520 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3521 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3522 continue; // skip tile if all PUs are out of fullMask 3523 } 3524 ++nL; 3525 if (nL <= __kmp_hws_tile.offset || 3526 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3527 // skip tile as not requested 3528 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3529 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3530 continue; // move to next tile 3531 } 3532 // tile requested, go down the topology tree 3533 nC = 0; 3534 hC = NULL; 3535 // num cores in current tile 3536 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3537 HWLOC_OBJ_CORE, &hC); 3538 for (int c = 0; c < NC; ++c) { 3539 // Check Core --------------------------------------- 3540 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3541 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3542 continue; // skip core if all PUs are out of fullMask 3543 } 3544 ++nC; 3545 if (nC <= __kmp_hws_core.offset || 3546 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3547 // skip node as not requested 3548 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3549 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3550 continue; // move to next node 3551 } 3552 // core requested, go down to PUs 3553 nT = 0; 3554 nTr = 0; 3555 hT = NULL; 3556 // num procs in current core 3557 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3558 HWLOC_OBJ_PU, &hT); 3559 for (int t = 0; t < NT; ++t) { 3560 // Check PU --------------------------------------- 3561 idx = hT->os_index; 3562 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3563 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3564 continue; // skip PU if not in fullMask 3565 } 3566 ++nT; 3567 if (nT <= __kmp_hws_proc.offset || 3568 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3569 // skip PU 3570 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3571 ++n_old; 3572 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3573 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3574 continue; // move to next node 3575 } 3576 ++nTr; 3577 if (pAddr) // collect requested thread's data 3578 newAddr[n_new] = (*pAddr)[n_old]; 3579 ++n_new; 3580 ++n_old; 3581 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3582 } // threads loop 3583 if (nTr > 0) { 3584 ++nCr; // num cores per socket 3585 ++nCo; // total num cores 3586 if (nTr > nTpC) 3587 nTpC = nTr; // calc max threads per core 3588 } 3589 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3590 } // cores loop 3591 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3592 } // tiles loop 3593 } else { // tile_support 3594 // no tiles, check cores 3595 nC = 0; 3596 hC = NULL; 3597 // num cores in current node 3598 int NC = 3599 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3600 for (int c = 0; c < NC; ++c) { 3601 // Check Core --------------------------------------- 3602 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3603 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3604 continue; // skip core if all PUs are out of fullMask 3605 } 3606 ++nC; 3607 if (nC <= __kmp_hws_core.offset || 3608 nC > __kmp_hws_core.num 
+ __kmp_hws_core.offset) { 3609 // skip node as not requested 3610 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3611 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3612 continue; // move to next node 3613 } 3614 // core requested, go down to PUs 3615 nT = 0; 3616 nTr = 0; 3617 hT = NULL; 3618 int NT = 3619 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3620 for (int t = 0; t < NT; ++t) { 3621 // Check PU --------------------------------------- 3622 idx = hT->os_index; 3623 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3624 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3625 continue; // skip PU if not in fullMask 3626 } 3627 ++nT; 3628 if (nT <= __kmp_hws_proc.offset || 3629 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3630 // skip PU 3631 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3632 ++n_old; 3633 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3634 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3635 continue; // move to next node 3636 } 3637 ++nTr; 3638 if (pAddr) // collect requested thread's data 3639 newAddr[n_new] = (*pAddr)[n_old]; 3640 ++n_new; 3641 ++n_old; 3642 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3643 } // threads loop 3644 if (nTr > 0) { 3645 ++nCr; // num cores per socket 3646 ++nCo; // total num cores 3647 if (nTr > nTpC) 3648 nTpC = nTr; // calc max threads per core 3649 } 3650 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3651 } // cores loop 3652 } // tiles support 3653 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3654 } // nodes loop 3655 } else { // numa_support 3656 // no NUMA support 3657 if (tile_support) { 3658 nL = 0; 3659 hL = NULL; 3660 // num tiles in current socket 3661 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3662 for (int l = 0; l < NL; ++l) { 3663 // Check L2 (tile) ------------------------------------ 3664 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3665 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3666 continue; // skip tile if all PUs are out of fullMask 3667 } 3668 ++nL; 3669 if (nL <= __kmp_hws_tile.offset || 3670 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3671 // skip tile as not requested 3672 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3673 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3674 continue; // move to next tile 3675 } 3676 // tile requested, go down the topology tree 3677 nC = 0; 3678 hC = NULL; 3679 // num cores per tile 3680 int NC = 3681 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); 3682 for (int c = 0; c < NC; ++c) { 3683 // Check Core --------------------------------------- 3684 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3685 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3686 continue; // skip core if all PUs are out of fullMask 3687 } 3688 ++nC; 3689 if (nC <= __kmp_hws_core.offset || 3690 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3691 // skip node as not requested 3692 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3693 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3694 continue; // move to next node 3695 } 3696 // core requested, go down to PUs 3697 nT = 0; 3698 nTr = 0; 3699 hT = NULL; 3700 // num procs per core 3701 int NT = 3702 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3703 for (int t = 0; t < NT; ++t) { 3704 // Check PU --------------------------------------- 3705 idx = hT->os_index; 3706 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3707 hT = hwloc_get_next_obj_by_type(tp, 
HWLOC_OBJ_PU, hT); 3708 continue; // skip PU if not in fullMask 3709 } 3710 ++nT; 3711 if (nT <= __kmp_hws_proc.offset || 3712 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3713 // skip PU 3714 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3715 ++n_old; 3716 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3717 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3718 continue; // move to next node 3719 } 3720 ++nTr; 3721 if (pAddr) // collect requested thread's data 3722 newAddr[n_new] = (*pAddr)[n_old]; 3723 ++n_new; 3724 ++n_old; 3725 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3726 } // threads loop 3727 if (nTr > 0) { 3728 ++nCr; // num cores per socket 3729 ++nCo; // total num cores 3730 if (nTr > nTpC) 3731 nTpC = nTr; // calc max threads per core 3732 } 3733 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3734 } // cores loop 3735 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3736 } // tiles loop 3737 } else { // tile_support 3738 // no tiles, check cores 3739 nC = 0; 3740 hC = NULL; 3741 // num cores in socket 3742 int NC = 3743 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3744 for (int c = 0; c < NC; ++c) { 3745 // Check Core ------------------------------------------- 3746 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3747 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3748 continue; // skip core if all PUs are out of fullMask 3749 } 3750 ++nC; 3751 if (nC <= __kmp_hws_core.offset || 3752 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3753 // skip node as not requested 3754 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3755 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3756 continue; // move to next node 3757 } 3758 // core requested, go down to PUs 3759 nT = 0; 3760 nTr = 0; 3761 hT = NULL; 3762 // num procs per core 3763 int NT = 3764 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3765 for (int t = 0; t < NT; ++t) { 3766 // Check PU --------------------------------------- 3767 idx = hT->os_index; 3768 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3769 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3770 continue; // skip PU if not in fullMask 3771 } 3772 ++nT; 3773 if (nT <= __kmp_hws_proc.offset || 3774 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3775 // skip PU 3776 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3777 ++n_old; 3778 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3779 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3780 continue; // move to next node 3781 } 3782 ++nTr; 3783 if (pAddr) // collect requested thread's data 3784 newAddr[n_new] = (*pAddr)[n_old]; 3785 ++n_new; 3786 ++n_old; 3787 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3788 } // threads loop 3789 if (nTr > 0) { 3790 ++nCr; // num cores per socket 3791 ++nCo; // total num cores 3792 if (nTr > nTpC) 3793 nTpC = nTr; // calc max threads per core 3794 } 3795 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3796 } // cores loop 3797 } // tiles support 3798 } // numa_support 3799 if (nCr > 0) { // found cores? 
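// A socket contributes to nPkg only if at least one of its cores kept a PU
// after the filtering above; nCpP tracks the largest number of surviving
// cores seen on any single socket.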
3800 ++nPkg; // num sockets 3801 if (nCr > nCpP) 3802 nCpP = nCr; // calc max cores per socket 3803 } 3804 } // sockets loop 3805 3806 // check the subset is valid 3807 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3808 KMP_DEBUG_ASSERT(nPkg > 0); 3809 KMP_DEBUG_ASSERT(nCpP > 0); 3810 KMP_DEBUG_ASSERT(nTpC > 0); 3811 KMP_DEBUG_ASSERT(nCo > 0); 3812 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3813 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3814 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3815 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3816 3817 nPackages = nPkg; // correct num sockets 3818 nCoresPerPkg = nCpP; // correct num cores per socket 3819 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3820 __kmp_avail_proc = n_new; // correct num procs 3821 __kmp_ncores = nCo; // correct num cores 3822 // hwloc topology method end 3823 } else 3824 #endif // KMP_USE_HWLOC 3825 { 3826 int n_old = 0, n_new = 0, proc_num = 0; 3827 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3828 KMP_WARNING(AffHWSubsetNoHWLOC); 3829 goto _exit; 3830 } 3831 if (__kmp_hws_socket.num == 0) 3832 __kmp_hws_socket.num = nPackages; // use all available sockets 3833 if (__kmp_hws_core.num == 0) 3834 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3835 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3836 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3837 if (!__kmp_affinity_uniform_topology()) { 3838 KMP_WARNING(AffHWSubsetNonUniform); 3839 goto _exit; // don't support non-uniform topology 3840 } 3841 if (depth > 3) { 3842 KMP_WARNING(AffHWSubsetNonThreeLevel); 3843 goto _exit; // don't support not-3-level topology 3844 } 3845 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3846 KMP_WARNING(AffHWSubsetManySockets); 3847 goto _exit; 3848 } 3849 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 3850 KMP_WARNING(AffHWSubsetManyCores); 3851 goto _exit; 3852 } 3853 // Form the requested subset 3854 if (pAddr) // pAddr is NULL in case of affinity_none 3855 newAddr = (AddrUnsPair *)__kmp_allocate( 3856 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * 3857 __kmp_hws_proc.num); 3858 for (int i = 0; i < nPackages; ++i) { 3859 if (i < __kmp_hws_socket.offset || 3860 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3861 // skip not-requested socket 3862 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3863 if (__kmp_pu_os_idx != NULL) { 3864 // walk through skipped socket 3865 for (int j = 0; j < nCoresPerPkg; ++j) { 3866 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3867 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3868 ++proc_num; 3869 } 3870 } 3871 } 3872 } else { 3873 // walk through requested socket 3874 for (int j = 0; j < nCoresPerPkg; ++j) { 3875 if (j < __kmp_hws_core.offset || 3876 j >= __kmp_hws_core.offset + 3877 __kmp_hws_core.num) { // skip not-requested core 3878 n_old += __kmp_nThreadsPerCore; 3879 if (__kmp_pu_os_idx != NULL) { 3880 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3881 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3882 ++proc_num; 3883 } 3884 } 3885 } else { 3886 // walk through requested core 3887 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3888 if (k < __kmp_hws_proc.num) { 3889 if (pAddr) // collect requested thread's data 3890 newAddr[n_new] = (*pAddr)[n_old]; 3891 n_new++; 3892 } else { 3893 if (__kmp_pu_os_idx != NULL) 3894 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3895 } 3896 n_old++; 3897 ++proc_num; 3898 
}
3899 }
3900 }
3901 }
3902 }
3903 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3904 KMP_DEBUG_ASSERT(n_new ==
3905 __kmp_hws_socket.num * __kmp_hws_core.num *
3906 __kmp_hws_proc.num);
3907 nPackages = __kmp_hws_socket.num; // correct nPackages
3908 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
3909 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
3910 __kmp_avail_proc = n_new; // correct avail_proc
3911 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
3912 } // non-hwloc topology method
3913 if (pAddr) {
3914 __kmp_free(*pAddr);
3915 *pAddr = newAddr; // replace old topology with new one
3916 }
3917 if (__kmp_affinity_verbose) {
3918 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
3919 kmp_str_buf_t buf;
3920 __kmp_str_buf_init(&buf);
3921 __kmp_str_buf_print(&buf, "%d", nPackages);
3922 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
3923 __kmp_nThreadsPerCore, __kmp_ncores);
3924 __kmp_str_buf_free(&buf);
3925 }
3926 _exit:
3927 if (__kmp_pu_os_idx != NULL) {
3928 __kmp_free(__kmp_pu_os_idx);
3929 __kmp_pu_os_idx = NULL;
3930 }
3931 }
3932
3933 // This function figures out the deepest level at which there is at least one
3934 // cluster/core with more than one processing unit bound to it.
3935 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
3936 int nprocs, int bottom_level) {
3937 int core_level = 0;
3938
3939 for (int i = 0; i < nprocs; i++) {
3940 for (int j = bottom_level; j > 0; j--) {
3941 if (address2os[i].first.labels[j] > 0) {
3942 if (core_level < (j - 1)) {
3943 core_level = j - 1;
3944 }
3945 }
3946 }
3947 }
3948 return core_level;
3949 }
3950
3951 // This function counts the number of clusters/cores at the given level.
3952 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
3953 int nprocs, int bottom_level,
3954 int core_level) {
3955 int ncores = 0;
3956 int i, j;
3957
3958 j = bottom_level;
3959 for (i = 0; i < nprocs; i++) {
3960 for (j = bottom_level; j > core_level; j--) {
3961 if ((i + 1) < nprocs) {
3962 if (address2os[i + 1].first.labels[j] > 0) {
3963 break;
3964 }
3965 }
3966 }
3967 if (j == core_level) {
3968 ncores++;
3969 }
3970 }
3971 if (j > core_level) {
3972 // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss
3973 // one core. This may occur when called from __kmp_affinity_find_core().
3974 ncores++;
3975 }
3976 return ncores;
3977 }
3978
3979 // This function finds the cluster/core to which a given processing unit is bound.
3980 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
3981 int bottom_level, int core_level) {
3982 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
3983 core_level) -
3984 1;
3985 }
3986
3987 // This function finds the maximal number of processing units bound to a
3988 // cluster/core at the given level.
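// A minimal illustration of the labels convention assumed here (hypothetical
// values, not from a real machine): with bottom_level = 2 and core_level = 1,
// labels[core_level + 1] is the thread index within a core, so a core whose
// highest thread label is 1 contributes labels[2] + 1 == 2 processing units.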
3989 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 3990 int nprocs, int bottom_level, 3991 int core_level) { 3992 int maxprocpercore = 0; 3993 3994 if (core_level < bottom_level) { 3995 for (int i = 0; i < nprocs; i++) { 3996 int percore = address2os[i].first.labels[core_level + 1] + 1; 3997 3998 if (percore > maxprocpercore) { 3999 maxprocpercore = percore; 4000 } 4001 } 4002 } else { 4003 maxprocpercore = 1; 4004 } 4005 return maxprocpercore; 4006 } 4007 4008 static AddrUnsPair *address2os = NULL; 4009 static int *procarr = NULL; 4010 static int __kmp_aff_depth = 0; 4011 4012 #if KMP_USE_HIER_SCHED 4013 #define KMP_EXIT_AFF_NONE \ 4014 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4015 KMP_ASSERT(address2os == NULL); \ 4016 __kmp_apply_thread_places(NULL, 0); \ 4017 __kmp_create_affinity_none_places(); \ 4018 __kmp_dispatch_set_hierarchy_values(); \ 4019 return; 4020 #else 4021 #define KMP_EXIT_AFF_NONE \ 4022 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4023 KMP_ASSERT(address2os == NULL); \ 4024 __kmp_apply_thread_places(NULL, 0); \ 4025 __kmp_create_affinity_none_places(); \ 4026 return; 4027 #endif 4028 4029 // Create a one element mask array (set of places) which only contains the 4030 // initial process's affinity mask 4031 static void __kmp_create_affinity_none_places() { 4032 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4033 KMP_ASSERT(__kmp_affinity_type == affinity_none); 4034 __kmp_affinity_num_masks = 1; 4035 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4036 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 4037 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 4038 } 4039 4040 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 4041 const Address *aa = &(((const AddrUnsPair *)a)->first); 4042 const Address *bb = &(((const AddrUnsPair *)b)->first); 4043 unsigned depth = aa->depth; 4044 unsigned i; 4045 KMP_DEBUG_ASSERT(depth == bb->depth); 4046 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 4047 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 4048 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 4049 int j = depth - i - 1; 4050 if (aa->childNums[j] < bb->childNums[j]) 4051 return -1; 4052 if (aa->childNums[j] > bb->childNums[j]) 4053 return 1; 4054 } 4055 for (; i < depth; i++) { 4056 int j = i - __kmp_affinity_compact; 4057 if (aa->childNums[j] < bb->childNums[j]) 4058 return -1; 4059 if (aa->childNums[j] > bb->childNums[j]) 4060 return 1; 4061 } 4062 return 0; 4063 } 4064 4065 static void __kmp_aux_affinity_initialize(void) { 4066 if (__kmp_affinity_masks != NULL) { 4067 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4068 return; 4069 } 4070 4071 // Create the "full" mask - this defines all of the processors that we 4072 // consider to be in the machine model. If respect is set, then it is the 4073 // initialization thread's affinity mask. Otherwise, it is all processors that 4074 // we know about on the machine. 4075 if (__kmp_affin_fullMask == NULL) { 4076 KMP_CPU_ALLOC(__kmp_affin_fullMask); 4077 } 4078 if (KMP_AFFINITY_CAPABLE()) { 4079 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 4080 if (__kmp_affinity_respect_mask) { 4081 // Count the number of available processors. 
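// __kmp_avail_proc becomes the popcount of the initial affinity mask; the
// check below treats a count larger than the OS processor count __kmp_xproc
// as an initialization failure and falls back to affinity_none.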
4082 unsigned i; 4083 __kmp_avail_proc = 0; 4084 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 4085 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 4086 continue; 4087 } 4088 __kmp_avail_proc++; 4089 } 4090 if (__kmp_avail_proc > __kmp_xproc) { 4091 if (__kmp_affinity_verbose || 4092 (__kmp_affinity_warnings && 4093 (__kmp_affinity_type != affinity_none))) { 4094 KMP_WARNING(ErrorInitializeAffinity); 4095 } 4096 __kmp_affinity_type = affinity_none; 4097 KMP_AFFINITY_DISABLE(); 4098 return; 4099 } 4100 4101 if (__kmp_affinity_verbose) { 4102 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4103 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4104 __kmp_affin_fullMask); 4105 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 4106 } 4107 } else { 4108 if (__kmp_affinity_verbose) { 4109 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4110 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4111 __kmp_affin_fullMask); 4112 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 4113 } 4114 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 4115 __kmp_avail_proc = __kmp_xproc; 4116 #if KMP_OS_WINDOWS 4117 // Set the process affinity mask since threads' affinity 4118 // masks must be subset of process mask in Windows* OS 4119 __kmp_affin_fullMask->set_process_affinity(true); 4120 #endif 4121 } 4122 } 4123 4124 if (__kmp_affinity_gran == affinity_gran_tile && 4125 // check if user's request is valid 4126 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { 4127 KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); 4128 __kmp_affinity_gran = affinity_gran_package; 4129 } 4130 4131 int depth = -1; 4132 kmp_i18n_id_t msg_id = kmp_i18n_null; 4133 4134 // For backward compatibility, setting KMP_CPUINFO_FILE => 4135 // KMP_TOPOLOGY_METHOD=cpuinfo 4136 if ((__kmp_cpuinfo_file != NULL) && 4137 (__kmp_affinity_top_method == affinity_top_method_all)) { 4138 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 4139 } 4140 4141 if (__kmp_affinity_top_method == affinity_top_method_all) { 4142 // In the default code path, errors are not fatal - we just try using 4143 // another method. We only emit a warning message if affinity is on, or the 4144 // verbose flag is set, and the nowarnings flag was not set. 
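// The probes below run in order until one returns a positive depth: hwloc
// (when built in), the x2APIC id method, the legacy APIC id method,
// /proc/cpuinfo parsing on Linux, Windows processor groups (when more than
// one group exists), and finally the flat OS-proc map as a last resort.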
4145 const char *file_name = NULL; 4146 int line = 0; 4147 #if KMP_USE_HWLOC 4148 if (depth < 0 && 4149 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4150 if (__kmp_affinity_verbose) { 4151 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4152 } 4153 if (!__kmp_hwloc_error) { 4154 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4155 if (depth == 0) { 4156 KMP_EXIT_AFF_NONE; 4157 } else if (depth < 0 && __kmp_affinity_verbose) { 4158 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4159 } 4160 } else if (__kmp_affinity_verbose) { 4161 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4162 } 4163 } 4164 #endif 4165 4166 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4167 4168 if (depth < 0) { 4169 if (__kmp_affinity_verbose) { 4170 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4171 } 4172 4173 file_name = NULL; 4174 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4175 if (depth == 0) { 4176 KMP_EXIT_AFF_NONE; 4177 } 4178 4179 if (depth < 0) { 4180 if (__kmp_affinity_verbose) { 4181 if (msg_id != kmp_i18n_null) { 4182 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", 4183 __kmp_i18n_catgets(msg_id), 4184 KMP_I18N_STR(DecodingLegacyAPIC)); 4185 } else { 4186 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4187 KMP_I18N_STR(DecodingLegacyAPIC)); 4188 } 4189 } 4190 4191 file_name = NULL; 4192 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4193 if (depth == 0) { 4194 KMP_EXIT_AFF_NONE; 4195 } 4196 } 4197 } 4198 4199 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4200 4201 #if KMP_OS_LINUX 4202 4203 if (depth < 0) { 4204 if (__kmp_affinity_verbose) { 4205 if (msg_id != kmp_i18n_null) { 4206 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", 4207 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4208 } else { 4209 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4210 } 4211 } 4212 4213 kmp_safe_raii_file_t f("/proc/cpuinfo", "r"); 4214 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4215 if (depth == 0) { 4216 KMP_EXIT_AFF_NONE; 4217 } 4218 } 4219 4220 #endif /* KMP_OS_LINUX */ 4221 4222 #if KMP_GROUP_AFFINITY 4223 4224 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4225 if (__kmp_affinity_verbose) { 4226 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4227 } 4228 4229 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4230 KMP_ASSERT(depth != 0); 4231 } 4232 4233 #endif /* KMP_GROUP_AFFINITY */ 4234 4235 if (depth < 0) { 4236 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4237 if (file_name == NULL) { 4238 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4239 } else if (line == 0) { 4240 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4241 } else { 4242 KMP_INFORM(UsingFlatOSFileLine, file_name, line, 4243 __kmp_i18n_catgets(msg_id)); 4244 } 4245 } 4246 // FIXME - print msg if msg_id = kmp_i18n_null ??? 
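// Fall back to the flat map, which treats every OS proc as its own leaf and
// should not fail; the asserts below merely double-check that.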
4247 4248 file_name = ""; 4249 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4250 if (depth == 0) { 4251 KMP_EXIT_AFF_NONE; 4252 } 4253 KMP_ASSERT(depth > 0); 4254 KMP_ASSERT(address2os != NULL); 4255 } 4256 } 4257 4258 #if KMP_USE_HWLOC 4259 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4260 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4261 if (__kmp_affinity_verbose) { 4262 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4263 } 4264 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4265 if (depth == 0) { 4266 KMP_EXIT_AFF_NONE; 4267 } 4268 } 4269 #endif // KMP_USE_HWLOC 4270 4271 // If the user has specified that a particular topology discovery method is to be 4272 // used, then we abort if that method fails. The exception is group affinity, 4273 // which might have been implicitly set. 4274 4275 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4276 4277 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 4278 if (__kmp_affinity_verbose) { 4279 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4280 } 4281 4282 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4283 if (depth == 0) { 4284 KMP_EXIT_AFF_NONE; 4285 } 4286 if (depth < 0) { 4287 KMP_ASSERT(msg_id != kmp_i18n_null); 4288 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4289 } 4290 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4291 if (__kmp_affinity_verbose) { 4292 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4293 } 4294 4295 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4296 if (depth == 0) { 4297 KMP_EXIT_AFF_NONE; 4298 } 4299 if (depth < 0) { 4300 KMP_ASSERT(msg_id != kmp_i18n_null); 4301 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4302 } 4303 } 4304 4305 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4306 4307 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4308 const char *filename; 4309 const char *env_var = nullptr; 4310 if (__kmp_cpuinfo_file != NULL) { 4311 filename = __kmp_cpuinfo_file; 4312 env_var = "KMP_CPUINFO_FILE"; 4313 } else { 4314 filename = "/proc/cpuinfo"; 4315 } 4316 4317 if (__kmp_affinity_verbose) { 4318 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4319 } 4320 4321 kmp_safe_raii_file_t f(filename, "r", env_var); 4322 int line = 0; 4323 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4324 if (depth < 0) { 4325 KMP_ASSERT(msg_id != kmp_i18n_null); 4326 if (line > 0) { 4327 KMP_FATAL(FileLineMsgExiting, filename, line, 4328 __kmp_i18n_catgets(msg_id)); 4329 } else { 4330 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4331 } 4332 } 4333 if (__kmp_affinity_type == affinity_none) { 4334 KMP_ASSERT(depth == 0); 4335 KMP_EXIT_AFF_NONE; 4336 } 4337 } 4338 4339 #if KMP_GROUP_AFFINITY 4340 4341 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4342 if (__kmp_affinity_verbose) { 4343 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4344 } 4345 4346 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4347 KMP_ASSERT(depth != 0); 4348 if (depth < 0) { 4349 KMP_ASSERT(msg_id != kmp_i18n_null); 4350 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4351 } 4352 } 4353 4354 #endif /* KMP_GROUP_AFFINITY */ 4355 4356 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4357 if (__kmp_affinity_verbose) { 4358 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4359 } 4360 4361 depth = 
__kmp_affinity_create_flat_map(&address2os, &msg_id); 4362 if (depth == 0) { 4363 KMP_EXIT_AFF_NONE; 4364 } 4365 // should not fail 4366 KMP_ASSERT(depth > 0); 4367 KMP_ASSERT(address2os != NULL); 4368 } 4369 4370 #if KMP_USE_HIER_SCHED 4371 __kmp_dispatch_set_hierarchy_values(); 4372 #endif 4373 4374 if (address2os == NULL) { 4375 if (KMP_AFFINITY_CAPABLE() && 4376 (__kmp_affinity_verbose || 4377 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4378 KMP_WARNING(ErrorInitializeAffinity); 4379 } 4380 __kmp_affinity_type = affinity_none; 4381 __kmp_create_affinity_none_places(); 4382 KMP_AFFINITY_DISABLE(); 4383 return; 4384 } 4385 4386 if (__kmp_affinity_gran == affinity_gran_tile 4387 #if KMP_USE_HWLOC 4388 && __kmp_tile_depth == 0 4389 #endif 4390 ) { 4391 // tiles requested but not detected, warn user on this 4392 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); 4393 } 4394 4395 __kmp_apply_thread_places(&address2os, depth); 4396 4397 // Create the table of masks, indexed by thread Id. 4398 unsigned maxIndex; 4399 unsigned numUnique; 4400 kmp_affin_mask_t *osId2Mask = 4401 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4402 if (__kmp_affinity_gran_levels == 0) { 4403 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4404 } 4405 4406 // Set the childNums vector in all Address objects. This must be done before 4407 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4408 // account the setting of __kmp_affinity_compact. 4409 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4410 4411 switch (__kmp_affinity_type) { 4412 4413 case affinity_explicit: 4414 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4415 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 4416 __kmp_affinity_process_proclist( 4417 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4418 __kmp_affinity_proclist, osId2Mask, maxIndex); 4419 } else { 4420 __kmp_affinity_process_placelist( 4421 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4422 __kmp_affinity_proclist, osId2Mask, maxIndex); 4423 } 4424 if (__kmp_affinity_num_masks == 0) { 4425 if (__kmp_affinity_verbose || 4426 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4427 KMP_WARNING(AffNoValidProcID); 4428 } 4429 __kmp_affinity_type = affinity_none; 4430 __kmp_create_affinity_none_places(); 4431 return; 4432 } 4433 break; 4434 4435 // The other affinity types rely on sorting the Addresses according to some 4436 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4437 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4438 // to do the sort and create the array of affinity masks. 
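// Roughly, for a depth-3 {package, core, thread} machine: affinity_compact
// keeps consecutive gtids packed on the same package/core, while
// affinity_scatter inverts the permutation (depth - 1 - __kmp_affinity_compact)
// so consecutive masks land on different packages first.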
4439 4440 case affinity_logical: 4441 __kmp_affinity_compact = 0; 4442 if (__kmp_affinity_offset) { 4443 __kmp_affinity_offset = 4444 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4445 } 4446 goto sortAddresses; 4447 4448 case affinity_physical: 4449 if (__kmp_nThreadsPerCore > 1) { 4450 __kmp_affinity_compact = 1; 4451 if (__kmp_affinity_compact >= depth) { 4452 __kmp_affinity_compact = 0; 4453 } 4454 } else { 4455 __kmp_affinity_compact = 0; 4456 } 4457 if (__kmp_affinity_offset) { 4458 __kmp_affinity_offset = 4459 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4460 } 4461 goto sortAddresses; 4462 4463 case affinity_scatter: 4464 if (__kmp_affinity_compact >= depth) { 4465 __kmp_affinity_compact = 0; 4466 } else { 4467 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4468 } 4469 goto sortAddresses; 4470 4471 case affinity_compact: 4472 if (__kmp_affinity_compact >= depth) { 4473 __kmp_affinity_compact = depth - 1; 4474 } 4475 goto sortAddresses; 4476 4477 case affinity_balanced: 4478 if (depth <= 1) { 4479 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4480 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4481 } 4482 __kmp_affinity_type = affinity_none; 4483 __kmp_create_affinity_none_places(); 4484 return; 4485 } else if (!__kmp_affinity_uniform_topology()) { 4486 // Save the depth for further usage 4487 __kmp_aff_depth = depth; 4488 4489 int core_level = __kmp_affinity_find_core_level( 4490 address2os, __kmp_avail_proc, depth - 1); 4491 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4492 depth - 1, core_level); 4493 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4494 address2os, __kmp_avail_proc, depth - 1, core_level); 4495 4496 int nproc = ncores * maxprocpercore; 4497 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4498 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4499 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4500 } 4501 __kmp_affinity_type = affinity_none; 4502 return; 4503 } 4504 4505 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4506 for (int i = 0; i < nproc; i++) { 4507 procarr[i] = -1; 4508 } 4509 4510 int lastcore = -1; 4511 int inlastcore = 0; 4512 for (int i = 0; i < __kmp_avail_proc; i++) { 4513 int proc = address2os[i].second; 4514 int core = 4515 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4516 4517 if (core == lastcore) { 4518 inlastcore++; 4519 } else { 4520 inlastcore = 0; 4521 } 4522 lastcore = core; 4523 4524 procarr[core * maxprocpercore + inlastcore] = proc; 4525 } 4526 } 4527 if (__kmp_affinity_compact >= depth) { 4528 __kmp_affinity_compact = depth - 1; 4529 } 4530 4531 sortAddresses: 4532 // Allocate the gtid->affinity mask table. 4533 if (__kmp_affinity_dups) { 4534 __kmp_affinity_num_masks = __kmp_avail_proc; 4535 } else { 4536 __kmp_affinity_num_masks = numUnique; 4537 } 4538 4539 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4540 (__kmp_affinity_num_places > 0) && 4541 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4542 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4543 } 4544 4545 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4546 4547 // Sort the address2os table according to the current setting of 4548 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
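// With __kmp_affinity_dups off, only the leader entry of each granularity
// group contributes a mask in the loop below, which is why
// __kmp_affinity_num_masks was sized from numUnique rather than
// __kmp_avail_proc.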
4549 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4550 __kmp_affinity_cmp_Address_child_num); 4551 { 4552 int i; 4553 unsigned j; 4554 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4555 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { 4556 continue; 4557 } 4558 unsigned osId = address2os[i].second; 4559 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4560 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4561 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4562 KMP_CPU_COPY(dest, src); 4563 if (++j >= __kmp_affinity_num_masks) { 4564 break; 4565 } 4566 } 4567 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4568 } 4569 break; 4570 4571 default: 4572 KMP_ASSERT2(0, "Unexpected affinity setting"); 4573 } 4574 4575 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4576 machine_hierarchy.init(address2os, __kmp_avail_proc); 4577 } 4578 #undef KMP_EXIT_AFF_NONE 4579 4580 void __kmp_affinity_initialize(void) { 4581 // Much of the code above was written assuming that if a machine was not 4582 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4583 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4584 // There are too many checks for __kmp_affinity_type == affinity_none 4585 // in this code. Instead of trying to change them all, check if 4586 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4587 // affinity_none, call the real initialization routine, then restore 4588 // __kmp_affinity_type to affinity_disabled. 4589 int disabled = (__kmp_affinity_type == affinity_disabled); 4590 if (!KMP_AFFINITY_CAPABLE()) { 4591 KMP_ASSERT(disabled); 4592 } 4593 if (disabled) { 4594 __kmp_affinity_type = affinity_none; 4595 } 4596 __kmp_aux_affinity_initialize(); 4597 if (disabled) { 4598 __kmp_affinity_type = affinity_disabled; 4599 } 4600 } 4601 4602 void __kmp_affinity_uninitialize(void) { 4603 if (__kmp_affinity_masks != NULL) { 4604 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4605 __kmp_affinity_masks = NULL; 4606 } 4607 if (__kmp_affin_fullMask != NULL) { 4608 KMP_CPU_FREE(__kmp_affin_fullMask); 4609 __kmp_affin_fullMask = NULL; 4610 } 4611 __kmp_affinity_num_masks = 0; 4612 __kmp_affinity_type = affinity_default; 4613 __kmp_affinity_num_places = 0; 4614 if (__kmp_affinity_proclist != NULL) { 4615 __kmp_free(__kmp_affinity_proclist); 4616 __kmp_affinity_proclist = NULL; 4617 } 4618 if (address2os != NULL) { 4619 __kmp_free(address2os); 4620 address2os = NULL; 4621 } 4622 if (procarr != NULL) { 4623 __kmp_free(procarr); 4624 procarr = NULL; 4625 } 4626 #if KMP_USE_HWLOC 4627 if (__kmp_hwloc_topology != NULL) { 4628 hwloc_topology_destroy(__kmp_hwloc_topology); 4629 __kmp_hwloc_topology = NULL; 4630 } 4631 #endif 4632 KMPAffinity::destroy_api(); 4633 } 4634 4635 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4636 if (!KMP_AFFINITY_CAPABLE()) { 4637 return; 4638 } 4639 4640 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4641 if (th->th.th_affin_mask == NULL) { 4642 KMP_CPU_ALLOC(th->th.th_affin_mask); 4643 } else { 4644 KMP_CPU_ZERO(th->th.th_affin_mask); 4645 } 4646 4647 // Copy the thread mask to the kmp_info_t structure. If 4648 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that 4649 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4650 // then the full mask is the same as the mask of the initialization thread. 
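// Otherwise the thread is assigned place (gtid + __kmp_affinity_offset)
// modulo __kmp_affinity_num_masks, i.e. places are handed out to threads
// round-robin by gtid.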
4651 kmp_affin_mask_t *mask; 4652 int i; 4653 4654 if (KMP_AFFINITY_NON_PROC_BIND) { 4655 if ((__kmp_affinity_type == affinity_none) || 4656 (__kmp_affinity_type == affinity_balanced)) { 4657 #if KMP_GROUP_AFFINITY 4658 if (__kmp_num_proc_groups > 1) { 4659 return; 4660 } 4661 #endif 4662 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4663 i = 0; 4664 mask = __kmp_affin_fullMask; 4665 } else { 4666 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4667 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4668 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4669 } 4670 } else { 4671 if ((!isa_root) || 4672 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4673 #if KMP_GROUP_AFFINITY 4674 if (__kmp_num_proc_groups > 1) { 4675 return; 4676 } 4677 #endif 4678 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4679 i = KMP_PLACE_ALL; 4680 mask = __kmp_affin_fullMask; 4681 } else { 4682 // int i = some hash function or just a counter that doesn't 4683 // always start at 0. Use gtid for now. 4684 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4685 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4686 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4687 } 4688 } 4689 4690 th->th.th_current_place = i; 4691 if (isa_root) { 4692 th->th.th_new_place = i; 4693 th->th.th_first_place = 0; 4694 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4695 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4696 // When using a Non-OMP_PROC_BIND affinity method, 4697 // set all threads' place-partition-var to the entire place list 4698 th->th.th_first_place = 0; 4699 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4700 } 4701 4702 if (i == KMP_PLACE_ALL) { 4703 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4704 gtid)); 4705 } else { 4706 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4707 gtid, i)); 4708 } 4709 4710 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4711 4712 if (__kmp_affinity_verbose 4713 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4714 && (__kmp_affinity_type == affinity_none || 4715 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4716 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4717 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4718 th->th.th_affin_mask); 4719 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4720 __kmp_gettid(), gtid, buf); 4721 } 4722 4723 #if KMP_OS_WINDOWS 4724 // On Windows* OS, the process affinity mask might have changed. If the user 4725 // didn't request affinity and this call fails, just continue silently. 4726 // See CQ171393. 4727 if (__kmp_affinity_type == affinity_none) { 4728 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4729 } else 4730 #endif 4731 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4732 } 4733 4734 void __kmp_affinity_set_place(int gtid) { 4735 if (!KMP_AFFINITY_CAPABLE()) { 4736 return; 4737 } 4738 4739 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4740 4741 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4742 "place = %d)\n", 4743 gtid, th->th.th_new_place, th->th.th_current_place)); 4744 4745 // Check that the new place is within this thread's partition. 
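// The place partition may wrap around the end of the place list, so the
// asserts below accept either ordering of th_first_place and th_last_place.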
KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4747 KMP_ASSERT(th->th.th_new_place >= 0);
4748 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4749 if (th->th.th_first_place <= th->th.th_last_place) {
4750 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4751 (th->th.th_new_place <= th->th.th_last_place));
4752 } else {
4753 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4754 (th->th.th_new_place >= th->th.th_last_place));
4755 }
4756
4757 // Copy the thread mask to the kmp_info_t structure,
4758 // and set this thread's affinity.
4759 kmp_affin_mask_t *mask =
4760 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4761 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4762 th->th.th_current_place = th->th.th_new_place;
4763
4764 if (__kmp_affinity_verbose) {
4765 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4766 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4767 th->th.th_affin_mask);
4768 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4769 __kmp_gettid(), gtid, buf);
4770 }
4771 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4772 }
4773
4774 int __kmp_aux_set_affinity(void **mask) {
4775 int gtid;
4776 kmp_info_t *th;
4777 int retval;
4778
4779 if (!KMP_AFFINITY_CAPABLE()) {
4780 return -1;
4781 }
4782
4783 gtid = __kmp_entry_gtid();
4784 KA_TRACE(1000, (""); {
4785 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4786 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4787 (kmp_affin_mask_t *)(*mask));
4788 __kmp_debug_printf(
4789 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4790 buf);
4791 });
4792
4793 if (__kmp_env_consistency_check) {
4794 if ((mask == NULL) || (*mask == NULL)) {
4795 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4796 } else {
4797 unsigned proc;
4798 int num_procs = 0;
4799
4800 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4801 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4802 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4803 }
4804 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4805 continue;
4806 }
4807 num_procs++;
4808 }
4809 if (num_procs == 0) {
4810 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4811 }
4812
4813 #if KMP_GROUP_AFFINITY
4814 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4815 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4816 }
4817 #endif /* KMP_GROUP_AFFINITY */
4818 }
4819 }
4820
4821 th = __kmp_threads[gtid];
4822 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4823 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4824 if (retval == 0) {
4825 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4826 }
4827
4828 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4829 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4830 th->th.th_first_place = 0;
4831 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4832
4833 // Turn off 4.0 affinity for the current thread at this parallel level.
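// (Presumably so that later place-based, OMP_PROC_BIND-style rebinding does
// not override the mask the user just installed.)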
4834 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4835 4836 return retval; 4837 } 4838 4839 int __kmp_aux_get_affinity(void **mask) { 4840 int gtid; 4841 int retval; 4842 kmp_info_t *th; 4843 4844 if (!KMP_AFFINITY_CAPABLE()) { 4845 return -1; 4846 } 4847 4848 gtid = __kmp_entry_gtid(); 4849 th = __kmp_threads[gtid]; 4850 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4851 4852 KA_TRACE(1000, (""); { 4853 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4854 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4855 th->th.th_affin_mask); 4856 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4857 gtid, buf); 4858 }); 4859 4860 if (__kmp_env_consistency_check) { 4861 if ((mask == NULL) || (*mask == NULL)) { 4862 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4863 } 4864 } 4865 4866 #if !KMP_OS_WINDOWS 4867 4868 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4869 KA_TRACE(1000, (""); { 4870 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4871 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4872 (kmp_affin_mask_t *)(*mask)); 4873 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4874 gtid, buf); 4875 }); 4876 return retval; 4877 4878 #else 4879 4880 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4881 return 0; 4882 4883 #endif /* KMP_OS_WINDOWS */ 4884 } 4885 4886 int __kmp_aux_get_affinity_max_proc() { 4887 if (!KMP_AFFINITY_CAPABLE()) { 4888 return 0; 4889 } 4890 #if KMP_GROUP_AFFINITY 4891 if (__kmp_num_proc_groups > 1) { 4892 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4893 } 4894 #endif 4895 return __kmp_xproc; 4896 } 4897 4898 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4899 if (!KMP_AFFINITY_CAPABLE()) { 4900 return -1; 4901 } 4902 4903 KA_TRACE(1000, (""); { 4904 int gtid = __kmp_entry_gtid(); 4905 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4906 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4907 (kmp_affin_mask_t *)(*mask)); 4908 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4909 "affinity mask for thread %d = %s\n", 4910 proc, gtid, buf); 4911 }); 4912 4913 if (__kmp_env_consistency_check) { 4914 if ((mask == NULL) || (*mask == NULL)) { 4915 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4916 } 4917 } 4918 4919 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4920 return -1; 4921 } 4922 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4923 return -2; 4924 } 4925 4926 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4927 return 0; 4928 } 4929 4930 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4931 if (!KMP_AFFINITY_CAPABLE()) { 4932 return -1; 4933 } 4934 4935 KA_TRACE(1000, (""); { 4936 int gtid = __kmp_entry_gtid(); 4937 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4938 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4939 (kmp_affin_mask_t *)(*mask)); 4940 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4941 "affinity mask for thread %d = %s\n", 4942 proc, gtid, buf); 4943 }); 4944 4945 if (__kmp_env_consistency_check) { 4946 if ((mask == NULL) || (*mask == NULL)) { 4947 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4948 } 4949 } 4950 4951 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4952 return -1; 4953 } 4954 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4955 return -2; 4956 } 4957 4958 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4959 return 0; 4960 } 4961 4962 int 
__kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4963 if (!KMP_AFFINITY_CAPABLE()) { 4964 return -1; 4965 } 4966 4967 KA_TRACE(1000, (""); { 4968 int gtid = __kmp_entry_gtid(); 4969 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4970 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4971 (kmp_affin_mask_t *)(*mask)); 4972 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4973 "affinity mask for thread %d = %s\n", 4974 proc, gtid, buf); 4975 }); 4976 4977 if (__kmp_env_consistency_check) { 4978 if ((mask == NULL) || (*mask == NULL)) { 4979 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4980 } 4981 } 4982 4983 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4984 return -1; 4985 } 4986 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4987 return 0; 4988 } 4989 4990 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4991 } 4992 4993 // Dynamic affinity settings - Affinity balanced 4994 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 4995 KMP_DEBUG_ASSERT(th); 4996 bool fine_gran = true; 4997 int tid = th->th.th_info.ds.ds_tid; 4998 4999 switch (__kmp_affinity_gran) { 5000 case affinity_gran_fine: 5001 case affinity_gran_thread: 5002 break; 5003 case affinity_gran_core: 5004 if (__kmp_nThreadsPerCore > 1) { 5005 fine_gran = false; 5006 } 5007 break; 5008 case affinity_gran_package: 5009 if (nCoresPerPkg > 1) { 5010 fine_gran = false; 5011 } 5012 break; 5013 default: 5014 fine_gran = false; 5015 } 5016 5017 if (__kmp_affinity_uniform_topology()) { 5018 int coreID; 5019 int threadID; 5020 // Number of hyper threads per core in HT machine 5021 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 5022 // Number of cores 5023 int ncores = __kmp_ncores; 5024 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 5025 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 5026 ncores = nPackages; 5027 } 5028 // How many threads will be bound to each core 5029 int chunk = nthreads / ncores; 5030 // How many cores will have an additional thread bound to it - "big cores" 5031 int big_cores = nthreads % ncores; 5032 // Number of threads on the big cores 5033 int big_nth = (chunk + 1) * big_cores; 5034 if (tid < big_nth) { 5035 coreID = tid / (chunk + 1); 5036 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 5037 } else { // tid >= big_nth 5038 coreID = (tid - big_cores) / chunk; 5039 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 5040 } 5041 5042 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 5043 "Illegal set affinity operation when not capable"); 5044 5045 kmp_affin_mask_t *mask = th->th.th_affin_mask; 5046 KMP_CPU_ZERO(mask); 5047 5048 if (fine_gran) { 5049 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 5050 KMP_CPU_SET(osID, mask); 5051 } else { 5052 for (int i = 0; i < __kmp_nth_per_core; i++) { 5053 int osID; 5054 osID = address2os[coreID * __kmp_nth_per_core + i].second; 5055 KMP_CPU_SET(osID, mask); 5056 } 5057 } 5058 if (__kmp_affinity_verbose) { 5059 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5060 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5061 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5062 __kmp_gettid(), tid, buf); 5063 } 5064 __kmp_set_system_affinity(mask, TRUE); 5065 } else { // Non-uniform topology 5066 5067 kmp_affin_mask_t *mask = th->th.th_affin_mask; 5068 KMP_CPU_ZERO(mask); 5069 5070 int core_level = __kmp_affinity_find_core_level( 5071 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 5072 int ncores = 
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }

    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = address2os[coreID * __kmp_nth_per_core + i].second;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level = __kmp_affinity_find_core_level(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For a performance gain, consider the special case
    // nthreads == __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = address2os[tid].second;
        KMP_CPU_SET(osID, mask);
      } else {
        int core = __kmp_affinity_find_core(address2os, tid,
                                            __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = address2os[i].second;
          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
                                       core_level) == core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep the number of threads per context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is the GetProcessAffinityMask() API
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
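//
// A minimal sketch of that sequence on Linux (illustration only; assumes a
// pthread-based caller built with _GNU_SOURCE; run_native_work() is a
// hypothetical placeholder for step 4, and the prototype below is written out
// by hand for the example):
//
//   #include <pthread.h>
//   extern "C" int kmp_set_thread_affinity_mask_initial();
//
//   void run_with_full_mask() {
//     cpu_set_t saved;
//     pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//     if (kmp_set_thread_affinity_mask_initial() == 0) {          // steps 2-3
//       run_native_work();                                           // step 4
//     }
//     pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5
//   }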
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED