/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

// Print the affinity mask to the character array in a pretty format.
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf_len >= 40);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Find first element / check for empty set.
  int i;
  i = mask->begin();
  if (i == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    while (*scan != '\0')
      scan++;
    KMP_ASSERT(scan <= end);
    return buf;
  }

  KMP_SNPRINTF(scan, end - scan + 1, "{%d", i);
  while (*scan != '\0')
    scan++;
  i++;
  for (; i != mask->end(); i = mask->next(i)) {
    if (!KMP_CPU_ISSET(i, mask)) {
      continue;
    }

    // Check for buffer overflow.
    // A string of the form ",<n>" will have at most 10 characters, plus we
    // want to leave room to print ",...}" if the set is too large to print for
    // a total of 15 characters. We already left room for '\0' in setting end.
    if (end - scan < 15) {
      break;
    }
    KMP_SNPRINTF(scan, end - scan + 1, ",%-d", i);
    while (*scan != '\0')
      scan++;
  }
  if (i != mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, ",...");
    while (*scan != '\0')
      scan++;
  }
  KMP_SNPRINTF(scan, end - scan + 1, "}");
  while (*scan != '\0')
    scan++;
  KMP_ASSERT(scan <= end);
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
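// A line of the verbose output produced below looks roughly like the following
// (hypothetical OS proc and label values; the exact wording comes from the
// i18n message catalog):
//   OS proc 5 maps to Package 0 Core 2 Thread 1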
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);
    if (depth > 1) {
      int level = 1; // iterate over levels
      int label = 1; // iterate over labels
      if (__kmp_numa_detected)
        // node level follows package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                              addrP[proc].first.labels[label++]);
      if (__kmp_tile_depth > 0)
        // tile level follows node if any, or package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                              addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // core level follows
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                            addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // thread level is the last one
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                            addrP[proc].first.labels[label++]);
      KMP_DEBUG_ASSERT(label == depth);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when
// there is one thread context per core; the extra thread-context level adds
// no unique labels, so it is removed.
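// For example (hypothetical), a depth-3 map with levels {package, core, thread}
// in which every core carries exactly one thread context collapses to a
// depth-2 map with levels {package, core}.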
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int level;
  int i;
  int radix1_detected;
  int new_depth = depth;
  for (level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in address2os array
    if (level == new_depth) {
      // "turn off" deepest level, just decrement the depth that removes
      // the level from address2os array
      for (i = 0; i < nTh; ++i) {
        addrP[i].first.depth--;
      }
    } else {
      // For other levels, we move labels over and also reduce the depth
      int j;
      for (j = level; j < new_depth; ++j) {
        for (i = 0; i < nTh; ++i) {
          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
          addrP[i].first.depth--;
        }
        levels[j + 1] -= 1;
      }
    }
  }
  return new_depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PUs under the package
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
           obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o, unsigned depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if none found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if none found (as PU arity is 0)
}

static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
  hwloc_obj_t core = NULL;
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
    hwloc_obj_t pu = NULL;
    KMP_DEBUG_ASSERT(core != NULL);
    int num_active_threads = 0;
    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
    // int NT = core->arity; pu = core->first_child; // faster?
    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
        continue; // skip inactive (inaccessible) unit
      Address addr(depth + 2);
      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                    obj->os_index, obj->logical_index, core->os_index,
                    core->logical_index, pu->os_index, pu->logical_index));
      for (int i = 0; i < depth; ++i)
        addr.labels[i] = labels[i]; // package, etc.
      addr.labels[depth] = core_id; // core
      addr.labels[depth + 1] = pu_id; // pu
      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
      __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
      nActiveThreads++;
      ++num_active_threads; // count active threads per core
    }
    if (num_active_threads) { // were there any active threads on the core?
      ++__kmp_ncores; // count total active cores
      ++num_active_cores; // count active cores per socket
      if (num_active_threads > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
    }
  }
  return 0;
}

// Check if a NUMA node is detected below the package,
// and if a tile object is detected, record its depth.
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth;

  // Get some PU
  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  if (hN != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  // check tile, get object by depth because of multiple caches possible
  depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);
  __kmp_hwloc_check_numa();

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth = 3;
  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
  if (__kmp_numa_detected)
    ++depth;
  if (__kmp_tile_depth)
    ++depth;

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.

  hwloc_obj_t socket, node, tile;
  int nActiveThreads = 0;
  int socket_id = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
      socket_id++) {
    labels[0] = socket_id;
    if (__kmp_numa_detected) {
      int NN;
      int n_active_nodes = 0;
      node = NULL;
      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
                                              &node);
      for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
        labels[1] = node_id;
        if (__kmp_tile_depth) {
          // NUMA + tiles
          int NT;
          int n_active_tiles = 0;
          tile = NULL;
          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
                                                   &tile);
          for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
            labels[2] = tl_id;
            int n_active_cores = 0;
            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                            n_active_cores, tile, 3, labels);
            if (n_active_cores) { // were there any active cores on the tile?
              ++n_active_tiles; // count active tiles per node
              if (n_active_cores > nCorePerTile)
                nCorePerTile = n_active_cores; // calc maximum
            }
          }
          if (n_active_tiles) { // were there any active tiles on the node?
            ++n_active_nodes; // count active nodes per package
            if (n_active_tiles > nTilePerNode)
              nTilePerNode = n_active_tiles; // calc maximum
          }
        } else {
          // NUMA, no tiles
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, node, 2, labels);
          if (n_active_cores) { // were there any active cores on the node?
            ++n_active_nodes; // count active nodes per package
            if (n_active_cores > nCorePerNode)
              nCorePerNode = n_active_cores; // calc maximum
          }
        }
      }
      if (n_active_nodes) { // were there any active nodes on the socket?
        ++nPackages; // count total active packages
        if (n_active_nodes > nNodePerPkg)
          nNodePerPkg = n_active_nodes; // calc maximum
      }
    } else {
      if (__kmp_tile_depth) {
        // no NUMA, tiles
        int NT;
        int n_active_tiles = 0;
        tile = NULL;
        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
                                                 &tile);
        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
          labels[1] = tl_id;
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, tile, 2, labels);
          if (n_active_cores) { // were there any active cores on the tile?
            ++n_active_tiles; // count active tiles per package
            if (n_active_cores > nCorePerTile)
              nCorePerTile = n_active_cores; // calc maximum
          }
        }
        if (n_active_tiles) { // were there any active tiles on the socket?
          ++nPackages; // count total active packages
          if (n_active_tiles > nTilePerPkg)
            nTilePerPkg = n_active_tiles; // calc maximum
        }
      } else {
        // no NUMA, no tiles
        int n_active_cores = 0;
        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
                                        socket, 1, labels);
        if (n_active_cores) { // were there any active cores on the socket?
          ++nPackages; // count total active packages
          if (n_active_cores > nCoresPerPkg)
            nCoresPerPkg = n_active_cores; // calc maximum
        }
      }
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
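  // (__kmp_affinity_cmp_Address_labels is assumed here to order entries
  // lexicographically by their labels, coarsest level first, so all PUs of a
  // given package/node/tile/core end up adjacent in the table.)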
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  int nPUs = nPackages * __kmp_nThreadsPerCore;
  if (__kmp_numa_detected) {
    if (__kmp_tile_depth) { // NUMA + tiles
      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
    } else { // NUMA, no tiles
      nPUs *= (nNodePerPkg * nCorePerNode);
    }
  } else {
    if (__kmp_tile_depth) { // no NUMA, tiles
      nPUs *= (nTilePerPkg * nCorePerTile);
    } else { // no NUMA, no tiles
      nPUs *= nCoresPerPkg;
    }
  }
  unsigned uniform = (nPUs == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    if (__kmp_numa_detected) {
      if (__kmp_tile_depth) { // NUMA + tiles
        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
                   __kmp_ncores);
      } else { // NUMA, no tiles
        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
        nPUs *= (nNodePerPkg * nCorePerNode);
      }
    } else {
      if (__kmp_tile_depth) { // no NUMA, tiles
        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
      } else { // no NUMA, no tiles
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        __kmp_str_buf_print(&buf, "%d", nPackages);
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
                   __kmp_nThreadsPerCore, __kmp_ncores);
        __kmp_str_buf_free(&buf);
      }
    }
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth_full = depth; // number of levels before compressing
  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 levels);
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if (__kmp_affinity_gran > affinity_gran_thread) {
      for (int i = 1; i <= depth_full; ++i) {
        if (__kmp_affinity_gran <= i) // only count deeper levels
          break;
        if (levels[depth_full - i] > 0)
          __kmp_affinity_gran_levels++;
      }
    }
    if (__kmp_affinity_gran > affinity_gran_package)
      __kmp_affinity_gran_levels++; // e.g. granularity=group
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will
    // be 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this only as an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
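    // Illustration with hypothetical numbers: __kmp_xproc = 16 and
    // nCoresPerPkg = 4 give nPackages = (16 + 4 - 1) / 4 = 4 below, with
    // __kmp_ncores = 16 and 1 thread per core assumed.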
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
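    // Worked illustration of the decomposition done below, with hypothetical
    // values: apicId = 0b1101101, maxThreadsPerPkg = 8 (widthCT = 3) and
    // maxCoresPerPkg = 4 (widthC = 2, widthT = 1) give pkgId = apicId >> 3 =
    // 13, coreId = (apicId >> 1) & 0x3 = 2, and threadId = apicId & 0x1 = 1.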
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained
    // locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned
  // among the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consistency checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars,
      // though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try
  // to get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but
      // so far, the only machine we have seen which does not exit this loop
      // before iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
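  // For each sub-leaf "level", cpuid(11, level) reports in eax[4:0] the number
  // of low-order x2APIC-ID bits consumed by this level and all finer ones; the
  // label extracted below is (apicId & ((1 << shift) - 1)) >> prev_shift.
  // Hypothetical example: shifts of 1 (SMT) and 5 (core) split x2APIC id
  // 0b1101101 into thread 1, core 6, and package 3.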
1549 unsigned int proc; 1550 int nApics = 0; 1551 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1552 // Skip this proc if it is not included in the machine model. 1553 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1554 continue; 1555 } 1556 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1557 1558 __kmp_affinity_dispatch->bind_thread(proc); 1559 1560 // Extract labels for each level in the machine topology map from Apic ID. 1561 Address addr(depth); 1562 int prev_shift = 0; 1563 1564 for (level = 0; level < depth; level++) { 1565 __kmp_x86_cpuid(11, level, &buf); 1566 unsigned apicId = buf.edx; 1567 if (buf.ebx == 0) { 1568 if (level != depth - 1) { 1569 KMP_CPU_FREE(oldMask); 1570 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1571 return -1; 1572 } 1573 addr.labels[depth - level - 1] = apicId >> prev_shift; 1574 level++; 1575 break; 1576 } 1577 int shift = buf.eax & 0x1f; 1578 int mask = (1 << shift) - 1; 1579 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1580 prev_shift = shift; 1581 } 1582 if (level != depth) { 1583 KMP_CPU_FREE(oldMask); 1584 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1585 return -1; 1586 } 1587 1588 retval[nApics] = AddrUnsPair(addr, proc); 1589 nApics++; 1590 } 1591 1592 // We've collected all the info we need. 1593 // Restore the old affinity mask for this thread. 1594 __kmp_set_system_affinity(oldMask, TRUE); 1595 1596 // If there's only one thread context to bind to, return now. 1597 KMP_ASSERT(nApics > 0); 1598 if (nApics == 1) { 1599 __kmp_ncores = nPackages = 1; 1600 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1601 if (__kmp_affinity_verbose) { 1602 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1603 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1604 1605 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1606 if (__kmp_affinity_respect_mask) { 1607 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1608 } else { 1609 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1610 } 1611 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1612 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1613 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1614 __kmp_nThreadsPerCore, __kmp_ncores); 1615 } 1616 1617 if (__kmp_affinity_type == affinity_none) { 1618 __kmp_free(retval); 1619 KMP_CPU_FREE(oldMask); 1620 return 0; 1621 } 1622 1623 // Form an Address object which only includes the package level. 1624 Address addr(1); 1625 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1626 retval[0].first = addr; 1627 1628 if (__kmp_affinity_gran_levels < 0) { 1629 __kmp_affinity_gran_levels = 0; 1630 } 1631 1632 if (__kmp_affinity_verbose) { 1633 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1634 } 1635 1636 *address2os = retval; 1637 KMP_CPU_FREE(oldMask); 1638 return 1; 1639 } 1640 1641 // Sort the table by physical Id. 1642 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1643 1644 // Find the radix at each of the levels. 
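// The scratch arrays below are indexed by topology level, 0 being the
// coarsest (package): totals[l] counts the distinct nodes seen at level l,
// counts[l] counts the level-l children of the current parent, maxCt[l]
// records the largest such child count, and last[l] holds the label most
// recently seen at level l.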
1645 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1646 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1647 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1648 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1649 for (level = 0; level < depth; level++) { 1650 totals[level] = 1; 1651 maxCt[level] = 1; 1652 counts[level] = 1; 1653 last[level] = retval[0].first.labels[level]; 1654 } 1655 1656 // From here on, the iteration variable "level" runs from the finest level to 1657 // the coarsest, i.e. we iterate forward through 1658 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1659 // backwards. 1660 for (proc = 1; (int)proc < nApics; proc++) { 1661 int level; 1662 for (level = 0; level < depth; level++) { 1663 if (retval[proc].first.labels[level] != last[level]) { 1664 int j; 1665 for (j = level + 1; j < depth; j++) { 1666 totals[j]++; 1667 counts[j] = 1; 1668 // The line below causes printing incorrect topology information in 1669 // case the max value for some level (maxCt[level]) is encountered 1670 // earlier than some less value while going through the array. For 1671 // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then 1672 // maxCt[1] == 2 1673 // whereas it must be 4. 1674 // TODO!!! Check if it can be commented safely 1675 // maxCt[j] = 1; 1676 last[j] = retval[proc].first.labels[j]; 1677 } 1678 totals[level]++; 1679 counts[level]++; 1680 if (counts[level] > maxCt[level]) { 1681 maxCt[level] = counts[level]; 1682 } 1683 last[level] = retval[proc].first.labels[level]; 1684 break; 1685 } else if (level == depth - 1) { 1686 __kmp_free(last); 1687 __kmp_free(maxCt); 1688 __kmp_free(counts); 1689 __kmp_free(totals); 1690 __kmp_free(retval); 1691 KMP_CPU_FREE(oldMask); 1692 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1693 return -1; 1694 } 1695 } 1696 } 1697 1698 // When affinity is off, this routine will still be called to set 1699 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1700 // Make sure all these vars are set correctly, and return if affinity is not 1701 // enabled. 1702 if (threadLevel >= 0) { 1703 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1704 } else { 1705 __kmp_nThreadsPerCore = 1; 1706 } 1707 nPackages = totals[pkgLevel]; 1708 1709 if (coreLevel >= 0) { 1710 __kmp_ncores = totals[coreLevel]; 1711 nCoresPerPkg = maxCt[coreLevel]; 1712 } else { 1713 __kmp_ncores = nPackages; 1714 nCoresPerPkg = 1; 1715 } 1716 1717 // Check to see if the machine topology is uniform 1718 unsigned prod = maxCt[0]; 1719 for (level = 1; level < depth; level++) { 1720 prod *= maxCt[level]; 1721 } 1722 bool uniform = (prod == totals[level - 1]); 1723 1724 // Print the machine topology summary. 
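// When KMP_AFFINITY=verbose is set, report the initial OS proc set, whether
// the full mask was respected, whether the topology is uniform, and the
// per-level extents computed above.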
1725 if (__kmp_affinity_verbose) { 1726 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1727 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1728 1729 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1730 if (__kmp_affinity_respect_mask) { 1731 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1732 } else { 1733 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1734 } 1735 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1736 if (uniform) { 1737 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1738 } else { 1739 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1740 } 1741 1742 kmp_str_buf_t buf; 1743 __kmp_str_buf_init(&buf); 1744 1745 __kmp_str_buf_print(&buf, "%d", totals[0]); 1746 for (level = 1; level <= pkgLevel; level++) { 1747 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1748 } 1749 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1750 __kmp_nThreadsPerCore, __kmp_ncores); 1751 1752 __kmp_str_buf_free(&buf); 1753 } 1754 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1755 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1756 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1757 for (proc = 0; (int)proc < nApics; ++proc) { 1758 __kmp_pu_os_idx[proc] = retval[proc].second; 1759 } 1760 if (__kmp_affinity_type == affinity_none) { 1761 __kmp_free(last); 1762 __kmp_free(maxCt); 1763 __kmp_free(counts); 1764 __kmp_free(totals); 1765 __kmp_free(retval); 1766 KMP_CPU_FREE(oldMask); 1767 return 0; 1768 } 1769 1770 // Find any levels with radiix 1, and remove them from the map 1771 // (except for the package level). 1772 int new_depth = 0; 1773 for (level = 0; level < depth; level++) { 1774 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1775 continue; 1776 } 1777 new_depth++; 1778 } 1779 1780 // If we are removing any levels, allocate a new vector to return, 1781 // and copy the relevant information to it. 1782 if (new_depth != depth) { 1783 AddrUnsPair *new_retval = 1784 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1785 for (proc = 0; (int)proc < nApics; proc++) { 1786 Address addr(new_depth); 1787 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1788 } 1789 int new_level = 0; 1790 int newPkgLevel = -1; 1791 int newCoreLevel = -1; 1792 int newThreadLevel = -1; 1793 for (level = 0; level < depth; level++) { 1794 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1795 // Remove this level. Never remove the package level 1796 continue; 1797 } 1798 if (level == pkgLevel) { 1799 newPkgLevel = new_level; 1800 } 1801 if (level == coreLevel) { 1802 newCoreLevel = new_level; 1803 } 1804 if (level == threadLevel) { 1805 newThreadLevel = new_level; 1806 } 1807 for (proc = 0; (int)proc < nApics; proc++) { 1808 new_retval[proc].first.labels[new_level] = 1809 retval[proc].first.labels[level]; 1810 } 1811 new_level++; 1812 } 1813 1814 __kmp_free(retval); 1815 retval = new_retval; 1816 depth = new_depth; 1817 pkgLevel = newPkgLevel; 1818 coreLevel = newCoreLevel; 1819 threadLevel = newThreadLevel; 1820 } 1821 1822 if (__kmp_affinity_gran_levels < 0) { 1823 // Set the granularity level based on what levels are modeled 1824 // in the machine topology map. 
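// __kmp_affinity_gran_levels counts how many of the finest levels are
// discarded when affinity masks are formed; e.g. with a thread level modeled
// and KMP_AFFINITY granularity=core, it ends up as 1, so the hardware threads
// of a core share a single mask.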
1825 __kmp_affinity_gran_levels = 0;
1826 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1827 __kmp_affinity_gran_levels++;
1828 }
1829 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1830 __kmp_affinity_gran_levels++;
1831 }
1832 if (__kmp_affinity_gran > affinity_gran_package) {
1833 __kmp_affinity_gran_levels++;
1834 }
1835 }
1836
1837 if (__kmp_affinity_verbose) {
1838 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
1839 threadLevel);
1840 }
1841
1842 __kmp_free(last);
1843 __kmp_free(maxCt);
1844 __kmp_free(counts);
1845 __kmp_free(totals);
1846 KMP_CPU_FREE(oldMask);
1847 *address2os = retval;
1848 return depth;
1849 }
1850
1851 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1852
1853 #define osIdIndex 0
1854 #define threadIdIndex 1
1855 #define coreIdIndex 2
1856 #define pkgIdIndex 3
1857 #define nodeIdIndex 4
1858
1859 typedef unsigned *ProcCpuInfo;
1860 static unsigned maxIndex = pkgIdIndex;
1861
1862 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
1863 const void *b) {
1864 unsigned i;
1865 const unsigned *aa = *(unsigned *const *)a;
1866 const unsigned *bb = *(unsigned *const *)b;
1867 for (i = maxIndex;; i--) {
1868 if (aa[i] < bb[i])
1869 return -1;
1870 if (aa[i] > bb[i])
1871 return 1;
1872 if (i == osIdIndex)
1873 break;
1874 }
1875 return 0;
1876 }
1877
1878 #if KMP_USE_HIER_SCHED
1879 // Set the array sizes for the hierarchy layers
1880 static void __kmp_dispatch_set_hierarchy_values() {
1881 // Set the maximum number of L1's to the number of cores.
1882 // Set the maximum number of L2's to either the number of cores / 2 for
1883 // the Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing,
1884 // or the number of cores for Intel(R) Xeon(R) processors.
1885 // Set the maximum number of NUMA nodes and L3's to the number of packages.
1886 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
1887 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
1888 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
1889 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1890 if (__kmp_mic_type >= mic3)
1891 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
1892 else
1893 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1894 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
1895 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
1896 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
1897 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
1898 // Set the number of threads per unit
1899 // Number of hardware threads per L1/L2/L3/NUMA/LOOP
1900 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
1901 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
1902 __kmp_nThreadsPerCore;
1903 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1904 if (__kmp_mic_type >= mic3)
1905 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
1906 2 * __kmp_nThreadsPerCore;
1907 else
1908 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
1909 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
1910 __kmp_nThreadsPerCore;
1911 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
1912 nCoresPerPkg * __kmp_nThreadsPerCore;
1913 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
1914 nCoresPerPkg * __kmp_nThreadsPerCore;
1915 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
1916 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
1917 }
1918
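// For illustration of the tables filled in above (hypothetical non-MIC
// machine with 2 packages x 8 cores x 2 threads): the maximum unit counts
// become 32 hardware threads, 16 L1s, 16 L2s, 2 L3s, 2 NUMA nodes and 1 loop
// layer, with 1, 2, 2, 16, 16 and 32 hardware threads per unit respectively.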
1919 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
1920 // i.e., this thread's L1 or this thread's L2, etc.
1921 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
1922 int index = type + 1;
1923 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
1924 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
1925 if (type == kmp_hier_layer_e::LAYER_THREAD)
1926 return tid;
1927 else if (type == kmp_hier_layer_e::LAYER_LOOP)
1928 return 0;
1929 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
1930 if (tid >= num_hw_threads)
1931 tid = tid % num_hw_threads;
1932 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
1933 }
1934
1935 // Return the number of t1's per t2
1936 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
1937 int i1 = t1 + 1;
1938 int i2 = t2 + 1;
1939 KMP_DEBUG_ASSERT(i1 <= i2);
1940 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
1941 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
1942 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
1943 // (nthreads/t2) / (nthreads/t1) = t1 / t2
1944 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
1945 }
1946 #endif // KMP_USE_HIER_SCHED
1947
1948 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1949 // affinity map.
1950 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
1951 int *line,
1952 kmp_i18n_id_t *const msg_id,
1953 FILE *f) {
1954 *address2os = NULL;
1955 *msg_id = kmp_i18n_null;
1956
1957 // Scan the file, counting the number of "processor" (osId) fields, and
1958 // find the highest value of <n> for a node_<n> field.
1959 char buf[256];
1960 unsigned num_records = 0;
1961 while (!feof(f)) {
1962 buf[sizeof(buf) - 1] = 1;
1963 if (!fgets(buf, sizeof(buf), f)) {
1964 // Read errors presumably because of EOF
1965 break;
1966 }
1967
1968 char s1[] = "processor";
1969 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1970 num_records++;
1971 continue;
1972 }
1973
1974 // FIXME - this will match "node_<n> <garbage>"
1975 unsigned level;
1976 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
1977 if (nodeIdIndex + level >= maxIndex) {
1978 maxIndex = nodeIdIndex + level;
1979 }
1980 continue;
1981 }
1982 }
1983
1984 // Check for empty file / no valid processor records, or too many. The number
1985 // of records can't exceed the number of valid bits in the affinity mask.
1986 if (num_records == 0) {
1987 *line = 0;
1988 *msg_id = kmp_i18n_str_NoProcRecords;
1989 return -1;
1990 }
1991 if (num_records > (unsigned)__kmp_xproc) {
1992 *line = 0;
1993 *msg_id = kmp_i18n_str_TooManyProcRecords;
1994 return -1;
1995 }
1996
1997 // Set the file pointer back to the beginning, so that we can scan the file
1998 // again, this time performing a full parse of the data. Allocate a vector of
1999 // ProcCpuInfo objects, where we will place the data. Adding an extra element
2000 // at the end allows us to remove a lot of extra checks for termination
2001 // conditions.
2002 if (fseek(f, 0, SEEK_SET) != 0) {
2003 *line = 0;
2004 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2005 return -1;
2006 }
2007
2008 // Allocate the array of records to store the proc info in. The dummy
2009 // element at the end makes the logic in filling them out easier to code.
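// threadInfo is a (num_records + 1) x (maxIndex + 1) table: row i holds the
// osId, threadId, coreId, pkgId and node ids parsed for one processor record,
// and the extra row gives the parser a valid scratch record to fill even
// after the last real record has been consumed.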
2010 unsigned **threadInfo = 2011 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2012 unsigned i; 2013 for (i = 0; i <= num_records; i++) { 2014 threadInfo[i] = 2015 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2016 } 2017 2018 #define CLEANUP_THREAD_INFO \ 2019 for (i = 0; i <= num_records; i++) { \ 2020 __kmp_free(threadInfo[i]); \ 2021 } \ 2022 __kmp_free(threadInfo); 2023 2024 // A value of UINT_MAX means that we didn't find the field 2025 unsigned __index; 2026 2027 #define INIT_PROC_INFO(p) \ 2028 for (__index = 0; __index <= maxIndex; __index++) { \ 2029 (p)[__index] = UINT_MAX; \ 2030 } 2031 2032 for (i = 0; i <= num_records; i++) { 2033 INIT_PROC_INFO(threadInfo[i]); 2034 } 2035 2036 unsigned num_avail = 0; 2037 *line = 0; 2038 while (!feof(f)) { 2039 // Create an inner scoping level, so that all the goto targets at the end of 2040 // the loop appear in an outer scoping level. This avoids warnings about 2041 // jumping past an initialization to a target in the same block. 2042 { 2043 buf[sizeof(buf) - 1] = 1; 2044 bool long_line = false; 2045 if (!fgets(buf, sizeof(buf), f)) { 2046 // Read errors presumably because of EOF 2047 // If there is valid data in threadInfo[num_avail], then fake 2048 // a blank line in ensure that the last address gets parsed. 2049 bool valid = false; 2050 for (i = 0; i <= maxIndex; i++) { 2051 if (threadInfo[num_avail][i] != UINT_MAX) { 2052 valid = true; 2053 } 2054 } 2055 if (!valid) { 2056 break; 2057 } 2058 buf[0] = 0; 2059 } else if (!buf[sizeof(buf) - 1]) { 2060 // The line is longer than the buffer. Set a flag and don't 2061 // emit an error if we were going to ignore the line, anyway. 2062 long_line = true; 2063 2064 #define CHECK_LINE \ 2065 if (long_line) { \ 2066 CLEANUP_THREAD_INFO; \ 2067 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2068 return -1; \ 2069 } 2070 } 2071 (*line)++; 2072 2073 char s1[] = "processor"; 2074 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2075 CHECK_LINE; 2076 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2077 unsigned val; 2078 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2079 goto no_val; 2080 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2081 #if KMP_ARCH_AARCH64 2082 // Handle the old AArch64 /proc/cpuinfo layout differently, 2083 // it contains all of the 'processor' entries listed in a 2084 // single 'Processor' section, therefore the normal looking 2085 // for duplicates in that section will always fail. 
2086 num_avail++; 2087 #else 2088 goto dup_field; 2089 #endif 2090 threadInfo[num_avail][osIdIndex] = val; 2091 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2092 char path[256]; 2093 KMP_SNPRINTF( 2094 path, sizeof(path), 2095 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2096 threadInfo[num_avail][osIdIndex]); 2097 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2098 2099 KMP_SNPRINTF(path, sizeof(path), 2100 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2101 threadInfo[num_avail][osIdIndex]); 2102 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2103 continue; 2104 #else 2105 } 2106 char s2[] = "physical id"; 2107 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2108 CHECK_LINE; 2109 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2110 unsigned val; 2111 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2112 goto no_val; 2113 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2114 goto dup_field; 2115 threadInfo[num_avail][pkgIdIndex] = val; 2116 continue; 2117 } 2118 char s3[] = "core id"; 2119 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2120 CHECK_LINE; 2121 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2122 unsigned val; 2123 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2124 goto no_val; 2125 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2126 goto dup_field; 2127 threadInfo[num_avail][coreIdIndex] = val; 2128 continue; 2129 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2130 } 2131 char s4[] = "thread id"; 2132 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2133 CHECK_LINE; 2134 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2135 unsigned val; 2136 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2137 goto no_val; 2138 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2139 goto dup_field; 2140 threadInfo[num_avail][threadIdIndex] = val; 2141 continue; 2142 } 2143 unsigned level; 2144 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2145 CHECK_LINE; 2146 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2147 unsigned val; 2148 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2149 goto no_val; 2150 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2151 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2152 goto dup_field; 2153 threadInfo[num_avail][nodeIdIndex + level] = val; 2154 continue; 2155 } 2156 2157 // We didn't recognize the leading token on the line. There are lots of 2158 // leading tokens that we don't recognize - if the line isn't empty, go on 2159 // to the next line. 2160 if ((*buf != 0) && (*buf != '\n')) { 2161 // If the line is longer than the buffer, read characters 2162 // until we find a newline. 2163 if (long_line) { 2164 int ch; 2165 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2166 ; 2167 } 2168 continue; 2169 } 2170 2171 // A newline has signalled the end of the processor record. 2172 // Check that there aren't too many procs specified. 2173 if ((int)num_avail == __kmp_xproc) { 2174 CLEANUP_THREAD_INFO; 2175 *msg_id = kmp_i18n_str_TooManyEntries; 2176 return -1; 2177 } 2178 2179 // Check for missing fields. The osId field must be there, and we 2180 // currently require that the physical id field is specified, also. 
2181 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2182 CLEANUP_THREAD_INFO; 2183 *msg_id = kmp_i18n_str_MissingProcField; 2184 return -1; 2185 } 2186 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2187 CLEANUP_THREAD_INFO; 2188 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2189 return -1; 2190 } 2191 2192 // Skip this proc if it is not included in the machine model. 2193 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2194 __kmp_affin_fullMask)) { 2195 INIT_PROC_INFO(threadInfo[num_avail]); 2196 continue; 2197 } 2198 2199 // We have a successful parse of this proc's info. 2200 // Increment the counter, and prepare for the next proc. 2201 num_avail++; 2202 KMP_ASSERT(num_avail <= num_records); 2203 INIT_PROC_INFO(threadInfo[num_avail]); 2204 } 2205 continue; 2206 2207 no_val: 2208 CLEANUP_THREAD_INFO; 2209 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2210 return -1; 2211 2212 dup_field: 2213 CLEANUP_THREAD_INFO; 2214 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2215 return -1; 2216 } 2217 *line = 0; 2218 2219 #if KMP_MIC && REDUCE_TEAM_SIZE 2220 unsigned teamSize = 0; 2221 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2222 2223 // check for num_records == __kmp_xproc ??? 2224 2225 // If there's only one thread context to bind to, form an Address object with 2226 // depth 1 and return immediately (or, if affinity is off, set address2os to 2227 // NULL and return). 2228 // 2229 // If it is configured to omit the package level when there is only a single 2230 // package, the logic at the end of this routine won't work if there is only a 2231 // single thread - it would try to form an Address object with depth 0. 2232 KMP_ASSERT(num_avail > 0); 2233 KMP_ASSERT(num_avail <= num_records); 2234 if (num_avail == 1) { 2235 __kmp_ncores = 1; 2236 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2237 if (__kmp_affinity_verbose) { 2238 if (!KMP_AFFINITY_CAPABLE()) { 2239 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2240 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2241 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2242 } else { 2243 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2244 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2245 __kmp_affin_fullMask); 2246 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2247 if (__kmp_affinity_respect_mask) { 2248 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2249 } else { 2250 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2251 } 2252 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2253 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2254 } 2255 int index; 2256 kmp_str_buf_t buf; 2257 __kmp_str_buf_init(&buf); 2258 __kmp_str_buf_print(&buf, "1"); 2259 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2260 __kmp_str_buf_print(&buf, " x 1"); 2261 } 2262 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2263 __kmp_str_buf_free(&buf); 2264 } 2265 2266 if (__kmp_affinity_type == affinity_none) { 2267 CLEANUP_THREAD_INFO; 2268 return 0; 2269 } 2270 2271 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2272 Address addr(1); 2273 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2274 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2275 2276 if (__kmp_affinity_gran_levels < 0) { 2277 __kmp_affinity_gran_levels = 0; 2278 } 2279 2280 if (__kmp_affinity_verbose) { 2281 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2282 } 2283 2284 CLEANUP_THREAD_INFO; 2285 return 1; 2286 } 2287 2288 // Sort the threadInfo table by physical Id. 
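// The comparator walks the fields from the most significant (node, then
// package, core and thread id) down to the osId, so all records that share a
// package, and within it a core, end up in consecutive rows.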
2289 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2290 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2291 2292 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2293 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2294 // the chips on a system. Although coreId's are usually assigned 2295 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2296 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2297 // 2298 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2299 // total # packages) are at this point - we want to determine that now. We 2300 // only have an upper bound on the first two figures. 2301 unsigned *counts = 2302 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2303 unsigned *maxCt = 2304 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2305 unsigned *totals = 2306 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2307 unsigned *lastId = 2308 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2309 2310 bool assign_thread_ids = false; 2311 unsigned threadIdCt; 2312 unsigned index; 2313 2314 restart_radix_check: 2315 threadIdCt = 0; 2316 2317 // Initialize the counter arrays with data from threadInfo[0]. 2318 if (assign_thread_ids) { 2319 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2320 threadInfo[0][threadIdIndex] = threadIdCt++; 2321 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2322 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2323 } 2324 } 2325 for (index = 0; index <= maxIndex; index++) { 2326 counts[index] = 1; 2327 maxCt[index] = 1; 2328 totals[index] = 1; 2329 lastId[index] = threadInfo[0][index]; 2330 ; 2331 } 2332 2333 // Run through the rest of the OS procs. 2334 for (i = 1; i < num_avail; i++) { 2335 // Find the most significant index whose id differs from the id for the 2336 // previous OS proc. 2337 for (index = maxIndex; index >= threadIdIndex; index--) { 2338 if (assign_thread_ids && (index == threadIdIndex)) { 2339 // Auto-assign the thread id field if it wasn't specified. 2340 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2341 threadInfo[i][threadIdIndex] = threadIdCt++; 2342 } 2343 // Apparently the thread id field was specified for some entries and not 2344 // others. Start the thread id counter off at the next higher thread id. 2345 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2346 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2347 } 2348 } 2349 if (threadInfo[i][index] != lastId[index]) { 2350 // Run through all indices which are less significant, and reset the 2351 // counts to 1. At all levels up to and including index, we need to 2352 // increment the totals and record the last id. 2353 unsigned index2; 2354 for (index2 = threadIdIndex; index2 < index; index2++) { 2355 totals[index2]++; 2356 if (counts[index2] > maxCt[index2]) { 2357 maxCt[index2] = counts[index2]; 2358 } 2359 counts[index2] = 1; 2360 lastId[index2] = threadInfo[i][index2]; 2361 } 2362 counts[index]++; 2363 totals[index]++; 2364 lastId[index] = threadInfo[i][index]; 2365 2366 if (assign_thread_ids && (index > threadIdIndex)) { 2367 2368 #if KMP_MIC && REDUCE_TEAM_SIZE 2369 // The default team size is the total #threads in the machine 2370 // minus 1 thread for every core that has 3 or more threads. 2371 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2372 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2373 2374 // Restart the thread counter, as we are on a new core. 
2375 threadIdCt = 0;
2376
2377 // Auto-assign the thread id field if it wasn't specified.
2378 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2379 threadInfo[i][threadIdIndex] = threadIdCt++;
2380 }
2381
2382 // Apparently the thread id field was specified for some entries and
2383 // not others. Start the thread id counter off at the next higher
2384 // thread id.
2385 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2386 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2387 }
2388 }
2389 break;
2390 }
2391 }
2392 if (index < threadIdIndex) {
2393 // If thread ids were specified, it is an error if they are not unique.
2394 // Also, check that we haven't already restarted the loop (to be safe -
2395 // shouldn't need to).
2396 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2397 __kmp_free(lastId);
2398 __kmp_free(totals);
2399 __kmp_free(maxCt);
2400 __kmp_free(counts);
2401 CLEANUP_THREAD_INFO;
2402 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2403 return -1;
2404 }
2405
2406 // If the thread ids were not specified and we see entries that
2407 // are duplicates, start the loop over and assign the thread ids manually.
2408 assign_thread_ids = true;
2409 goto restart_radix_check;
2410 }
2411 }
2412
2413 #if KMP_MIC && REDUCE_TEAM_SIZE
2414 // The default team size is the total #threads in the machine
2415 // minus 1 thread for every core that has 3 or more threads.
2416 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2417 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2418
2419 for (index = threadIdIndex; index <= maxIndex; index++) {
2420 if (counts[index] > maxCt[index]) {
2421 maxCt[index] = counts[index];
2422 }
2423 }
2424
2425 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2426 nCoresPerPkg = maxCt[coreIdIndex];
2427 nPackages = totals[pkgIdIndex];
2428
2429 // Check to see if the machine topology is uniform
2430 unsigned prod = totals[maxIndex];
2431 for (index = threadIdIndex; index < maxIndex; index++) {
2432 prod *= maxCt[index];
2433 }
2434 bool uniform = (prod == totals[threadIdIndex]);
2435
2436 // When affinity is off, this routine will still be called to set
2437 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2438 // Make sure all these vars are set correctly, and return now if affinity is
2439 // not enabled.
2440 __kmp_ncores = totals[coreIdIndex]; 2441 2442 if (__kmp_affinity_verbose) { 2443 if (!KMP_AFFINITY_CAPABLE()) { 2444 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2445 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2446 if (uniform) { 2447 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2448 } else { 2449 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2450 } 2451 } else { 2452 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2453 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2454 __kmp_affin_fullMask); 2455 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2456 if (__kmp_affinity_respect_mask) { 2457 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2458 } else { 2459 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2460 } 2461 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2462 if (uniform) { 2463 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2464 } else { 2465 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2466 } 2467 } 2468 kmp_str_buf_t buf; 2469 __kmp_str_buf_init(&buf); 2470 2471 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2472 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2473 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2474 } 2475 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2476 maxCt[threadIdIndex], __kmp_ncores); 2477 2478 __kmp_str_buf_free(&buf); 2479 } 2480 2481 #if KMP_MIC && REDUCE_TEAM_SIZE 2482 // Set the default team size. 2483 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2484 __kmp_dflt_team_nth = teamSize; 2485 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2486 "__kmp_dflt_team_nth = %d\n", 2487 __kmp_dflt_team_nth)); 2488 } 2489 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2490 2491 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2492 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2493 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2494 for (i = 0; i < num_avail; ++i) { // fill the os indices 2495 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2496 } 2497 2498 if (__kmp_affinity_type == affinity_none) { 2499 __kmp_free(lastId); 2500 __kmp_free(totals); 2501 __kmp_free(maxCt); 2502 __kmp_free(counts); 2503 CLEANUP_THREAD_INFO; 2504 return 0; 2505 } 2506 2507 // Count the number of levels which have more nodes at that level than at the 2508 // parent's level (with there being an implicit root node of the top level). 2509 // This is equivalent to saying that there is at least one node at this level 2510 // which has a sibling. These levels are in the map, and the package level is 2511 // always in the map. 2512 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2513 for (index = threadIdIndex; index < maxIndex; index++) { 2514 KMP_ASSERT(totals[index] >= totals[index + 1]); 2515 inMap[index] = (totals[index] > totals[index + 1]); 2516 } 2517 inMap[maxIndex] = (totals[maxIndex] > 1); 2518 inMap[pkgIdIndex] = true; 2519 2520 int depth = 0; 2521 for (index = threadIdIndex; index <= maxIndex; index++) { 2522 if (inMap[index]) { 2523 depth++; 2524 } 2525 } 2526 KMP_ASSERT(depth > 0); 2527 2528 // Construct the data structure that is to be returned. 
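// Each Address receives one label per level kept in inMap, copied from the
// most significant field down to the thread id, so the package (or node) id
// comes first and the finest retained level comes last; pkgLevel, coreLevel
// and threadLevel record which slot each of those fields landed in.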
2529 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2530 int pkgLevel = -1; 2531 int coreLevel = -1; 2532 int threadLevel = -1; 2533 2534 for (i = 0; i < num_avail; ++i) { 2535 Address addr(depth); 2536 unsigned os = threadInfo[i][osIdIndex]; 2537 int src_index; 2538 int dst_index = 0; 2539 2540 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2541 if (!inMap[src_index]) { 2542 continue; 2543 } 2544 addr.labels[dst_index] = threadInfo[i][src_index]; 2545 if (src_index == pkgIdIndex) { 2546 pkgLevel = dst_index; 2547 } else if (src_index == coreIdIndex) { 2548 coreLevel = dst_index; 2549 } else if (src_index == threadIdIndex) { 2550 threadLevel = dst_index; 2551 } 2552 dst_index++; 2553 } 2554 (*address2os)[i] = AddrUnsPair(addr, os); 2555 } 2556 2557 if (__kmp_affinity_gran_levels < 0) { 2558 // Set the granularity level based on what levels are modeled 2559 // in the machine topology map. 2560 unsigned src_index; 2561 __kmp_affinity_gran_levels = 0; 2562 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2563 if (!inMap[src_index]) { 2564 continue; 2565 } 2566 switch (src_index) { 2567 case threadIdIndex: 2568 if (__kmp_affinity_gran > affinity_gran_thread) { 2569 __kmp_affinity_gran_levels++; 2570 } 2571 2572 break; 2573 case coreIdIndex: 2574 if (__kmp_affinity_gran > affinity_gran_core) { 2575 __kmp_affinity_gran_levels++; 2576 } 2577 break; 2578 2579 case pkgIdIndex: 2580 if (__kmp_affinity_gran > affinity_gran_package) { 2581 __kmp_affinity_gran_levels++; 2582 } 2583 break; 2584 } 2585 } 2586 } 2587 2588 if (__kmp_affinity_verbose) { 2589 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2590 coreLevel, threadLevel); 2591 } 2592 2593 __kmp_free(inMap); 2594 __kmp_free(lastId); 2595 __kmp_free(totals); 2596 __kmp_free(maxCt); 2597 __kmp_free(counts); 2598 CLEANUP_THREAD_INFO; 2599 return depth; 2600 } 2601 2602 // Create and return a table of affinity masks, indexed by OS thread ID. 2603 // This routine handles OR'ing together all the affinity masks of threads 2604 // that are sufficiently close, if granularity > fine. 2605 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2606 unsigned *numUnique, 2607 AddrUnsPair *address2os, 2608 unsigned numAddrs) { 2609 // First form a table of affinity masks in order of OS thread id. 2610 unsigned depth; 2611 unsigned maxOsId; 2612 unsigned i; 2613 2614 KMP_ASSERT(numAddrs > 0); 2615 depth = address2os[0].first.depth; 2616 2617 maxOsId = 0; 2618 for (i = numAddrs - 1;; --i) { 2619 unsigned osId = address2os[i].second; 2620 if (osId > maxOsId) { 2621 maxOsId = osId; 2622 } 2623 if (i == 0) 2624 break; 2625 } 2626 kmp_affin_mask_t *osId2Mask; 2627 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2628 2629 // Sort the address2os table according to physical order. Doing so will put 2630 // all threads on the same core/package/node in consecutive locations. 
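// After the sort, the loop below makes a single pass over the table, OR-ing
// the OS procs of every address that is within __kmp_affinity_gran_levels of
// its group leader into one mask, and then copying that mask into the
// osId2Mask slot of every member of the group.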
2631 qsort(address2os, numAddrs, sizeof(*address2os), 2632 __kmp_affinity_cmp_Address_labels); 2633 2634 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2635 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2636 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2637 } 2638 if (__kmp_affinity_gran_levels >= (int)depth) { 2639 if (__kmp_affinity_verbose || 2640 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2641 KMP_WARNING(AffThreadsMayMigrate); 2642 } 2643 } 2644 2645 // Run through the table, forming the masks for all threads on each core. 2646 // Threads on the same core will have identical "Address" objects, not 2647 // considering the last level, which must be the thread id. All threads on a 2648 // core will appear consecutively. 2649 unsigned unique = 0; 2650 unsigned j = 0; // index of 1st thread on core 2651 unsigned leader = 0; 2652 Address *leaderAddr = &(address2os[0].first); 2653 kmp_affin_mask_t *sum; 2654 KMP_CPU_ALLOC_ON_STACK(sum); 2655 KMP_CPU_ZERO(sum); 2656 KMP_CPU_SET(address2os[0].second, sum); 2657 for (i = 1; i < numAddrs; i++) { 2658 // If this thread is sufficiently close to the leader (within the 2659 // granularity setting), then set the bit for this os thread in the 2660 // affinity mask for this group, and go on to the next thread. 2661 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2662 KMP_CPU_SET(address2os[i].second, sum); 2663 continue; 2664 } 2665 2666 // For every thread in this group, copy the mask to the thread's entry in 2667 // the osId2Mask table. Mark the first address as a leader. 2668 for (; j < i; j++) { 2669 unsigned osId = address2os[j].second; 2670 KMP_DEBUG_ASSERT(osId <= maxOsId); 2671 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2672 KMP_CPU_COPY(mask, sum); 2673 address2os[j].first.leader = (j == leader); 2674 } 2675 unique++; 2676 2677 // Start a new mask. 2678 leader = i; 2679 leaderAddr = &(address2os[i].first); 2680 KMP_CPU_ZERO(sum); 2681 KMP_CPU_SET(address2os[i].second, sum); 2682 } 2683 2684 // For every thread in last group, copy the mask to the thread's 2685 // entry in the osId2Mask table. 2686 for (; j < i; j++) { 2687 unsigned osId = address2os[j].second; 2688 KMP_DEBUG_ASSERT(osId <= maxOsId); 2689 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2690 KMP_CPU_COPY(mask, sum); 2691 address2os[j].first.leader = (j == leader); 2692 } 2693 unique++; 2694 KMP_CPU_FREE_FROM_STACK(sum); 2695 2696 *maxIndex = maxOsId; 2697 *numUnique = unique; 2698 return osId2Mask; 2699 } 2700 2701 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2702 // as file-static than to try and pass them through the calling sequence of 2703 // the recursive-descent OMP_PLACES parser. 
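// newMasks is a growable array of affinity masks: numNewMasks is its current
// capacity, nextNewMask is the number of masks stored so far, and ADD_MASK
// doubles the capacity whenever the array fills up.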
2704 static kmp_affin_mask_t *newMasks; 2705 static int numNewMasks; 2706 static int nextNewMask; 2707 2708 #define ADD_MASK(_mask) \ 2709 { \ 2710 if (nextNewMask >= numNewMasks) { \ 2711 int i; \ 2712 numNewMasks *= 2; \ 2713 kmp_affin_mask_t *temp; \ 2714 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2715 for (i = 0; i < numNewMasks / 2; i++) { \ 2716 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2717 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2718 KMP_CPU_COPY(dest, src); \ 2719 } \ 2720 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2721 newMasks = temp; \ 2722 } \ 2723 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2724 nextNewMask++; \ 2725 } 2726 2727 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2728 { \ 2729 if (((_osId) > _maxOsId) || \ 2730 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2731 if (__kmp_affinity_verbose || \ 2732 (__kmp_affinity_warnings && \ 2733 (__kmp_affinity_type != affinity_none))) { \ 2734 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2735 } \ 2736 } else { \ 2737 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2738 } \ 2739 } 2740 2741 // Re-parse the proclist (for the explicit affinity type), and form the list 2742 // of affinity newMasks indexed by gtid. 2743 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2744 unsigned int *out_numMasks, 2745 const char *proclist, 2746 kmp_affin_mask_t *osId2Mask, 2747 int maxOsId) { 2748 int i; 2749 const char *scan = proclist; 2750 const char *next = proclist; 2751 2752 // We use malloc() for the temporary mask vector, so that we can use 2753 // realloc() to extend it. 2754 numNewMasks = 2; 2755 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2756 nextNewMask = 0; 2757 kmp_affin_mask_t *sumMask; 2758 KMP_CPU_ALLOC(sumMask); 2759 int setSize = 0; 2760 2761 for (;;) { 2762 int start, end, stride; 2763 2764 SKIP_WS(scan); 2765 next = scan; 2766 if (*next == '\0') { 2767 break; 2768 } 2769 2770 if (*next == '{') { 2771 int num; 2772 setSize = 0; 2773 next++; // skip '{' 2774 SKIP_WS(next); 2775 scan = next; 2776 2777 // Read the first integer in the set. 2778 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2779 SKIP_DIGITS(next); 2780 num = __kmp_str_to_int(scan, *next); 2781 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2782 2783 // Copy the mask for that osId to the sum (union) mask. 2784 if ((num > maxOsId) || 2785 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2786 if (__kmp_affinity_verbose || 2787 (__kmp_affinity_warnings && 2788 (__kmp_affinity_type != affinity_none))) { 2789 KMP_WARNING(AffIgnoreInvalidProcID, num); 2790 } 2791 KMP_CPU_ZERO(sumMask); 2792 } else { 2793 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2794 setSize = 1; 2795 } 2796 2797 for (;;) { 2798 // Check for end of set. 2799 SKIP_WS(next); 2800 if (*next == '}') { 2801 next++; // skip '}' 2802 break; 2803 } 2804 2805 // Skip optional comma. 2806 if (*next == ',') { 2807 next++; 2808 } 2809 SKIP_WS(next); 2810 2811 // Read the next integer in the set. 2812 scan = next; 2813 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2814 2815 SKIP_DIGITS(next); 2816 num = __kmp_str_to_int(scan, *next); 2817 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2818 2819 // Add the mask for that osId to the sum mask. 
2820 if ((num > maxOsId) || 2821 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2822 if (__kmp_affinity_verbose || 2823 (__kmp_affinity_warnings && 2824 (__kmp_affinity_type != affinity_none))) { 2825 KMP_WARNING(AffIgnoreInvalidProcID, num); 2826 } 2827 } else { 2828 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2829 setSize++; 2830 } 2831 } 2832 if (setSize > 0) { 2833 ADD_MASK(sumMask); 2834 } 2835 2836 SKIP_WS(next); 2837 if (*next == ',') { 2838 next++; 2839 } 2840 scan = next; 2841 continue; 2842 } 2843 2844 // Read the first integer. 2845 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2846 SKIP_DIGITS(next); 2847 start = __kmp_str_to_int(scan, *next); 2848 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2849 SKIP_WS(next); 2850 2851 // If this isn't a range, then add a mask to the list and go on. 2852 if (*next != '-') { 2853 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2854 2855 // Skip optional comma. 2856 if (*next == ',') { 2857 next++; 2858 } 2859 scan = next; 2860 continue; 2861 } 2862 2863 // This is a range. Skip over the '-' and read in the 2nd int. 2864 next++; // skip '-' 2865 SKIP_WS(next); 2866 scan = next; 2867 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2868 SKIP_DIGITS(next); 2869 end = __kmp_str_to_int(scan, *next); 2870 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2871 2872 // Check for a stride parameter 2873 stride = 1; 2874 SKIP_WS(next); 2875 if (*next == ':') { 2876 // A stride is specified. Skip over the ':" and read the 3rd int. 2877 int sign = +1; 2878 next++; // skip ':' 2879 SKIP_WS(next); 2880 scan = next; 2881 if (*next == '-') { 2882 sign = -1; 2883 next++; 2884 SKIP_WS(next); 2885 scan = next; 2886 } 2887 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2888 SKIP_DIGITS(next); 2889 stride = __kmp_str_to_int(scan, *next); 2890 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2891 stride *= sign; 2892 } 2893 2894 // Do some range checks. 2895 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2896 if (stride > 0) { 2897 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2898 } else { 2899 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2900 } 2901 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2902 2903 // Add the mask for each OS proc # to the list. 2904 if (stride > 0) { 2905 do { 2906 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2907 start += stride; 2908 } while (start <= end); 2909 } else { 2910 do { 2911 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2912 start += stride; 2913 } while (start >= end); 2914 } 2915 2916 // Skip optional comma. 2917 SKIP_WS(next); 2918 if (*next == ',') { 2919 next++; 2920 } 2921 scan = next; 2922 } 2923 2924 *out_numMasks = nextNewMask; 2925 if (nextNewMask == 0) { 2926 *out_masks = NULL; 2927 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2928 return; 2929 } 2930 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2931 for (i = 0; i < nextNewMask; i++) { 2932 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2933 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2934 KMP_CPU_COPY(dest, src); 2935 } 2936 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2937 KMP_CPU_FREE(sumMask); 2938 } 2939 2940 #if OMP_40_ENABLED 2941 2942 /*----------------------------------------------------------------------------- 2943 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2944 places. 
Again, Here is the grammar: 2945 2946 place_list := place 2947 place_list := place , place_list 2948 place := num 2949 place := place : num 2950 place := place : num : signed 2951 place := { subplacelist } 2952 place := ! place // (lowest priority) 2953 subplace_list := subplace 2954 subplace_list := subplace , subplace_list 2955 subplace := num 2956 subplace := num : num 2957 subplace := num : num : signed 2958 signed := num 2959 signed := + signed 2960 signed := - signed 2961 -----------------------------------------------------------------------------*/ 2962 2963 static void __kmp_process_subplace_list(const char **scan, 2964 kmp_affin_mask_t *osId2Mask, 2965 int maxOsId, kmp_affin_mask_t *tempMask, 2966 int *setSize) { 2967 const char *next; 2968 2969 for (;;) { 2970 int start, count, stride, i; 2971 2972 // Read in the starting proc id 2973 SKIP_WS(*scan); 2974 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2975 next = *scan; 2976 SKIP_DIGITS(next); 2977 start = __kmp_str_to_int(*scan, *next); 2978 KMP_ASSERT(start >= 0); 2979 *scan = next; 2980 2981 // valid follow sets are ',' ':' and '}' 2982 SKIP_WS(*scan); 2983 if (**scan == '}' || **scan == ',') { 2984 if ((start > maxOsId) || 2985 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2986 if (__kmp_affinity_verbose || 2987 (__kmp_affinity_warnings && 2988 (__kmp_affinity_type != affinity_none))) { 2989 KMP_WARNING(AffIgnoreInvalidProcID, start); 2990 } 2991 } else { 2992 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2993 (*setSize)++; 2994 } 2995 if (**scan == '}') { 2996 break; 2997 } 2998 (*scan)++; // skip ',' 2999 continue; 3000 } 3001 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3002 (*scan)++; // skip ':' 3003 3004 // Read count parameter 3005 SKIP_WS(*scan); 3006 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3007 next = *scan; 3008 SKIP_DIGITS(next); 3009 count = __kmp_str_to_int(*scan, *next); 3010 KMP_ASSERT(count >= 0); 3011 *scan = next; 3012 3013 // valid follow sets are ',' ':' and '}' 3014 SKIP_WS(*scan); 3015 if (**scan == '}' || **scan == ',') { 3016 for (i = 0; i < count; i++) { 3017 if ((start > maxOsId) || 3018 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3019 if (__kmp_affinity_verbose || 3020 (__kmp_affinity_warnings && 3021 (__kmp_affinity_type != affinity_none))) { 3022 KMP_WARNING(AffIgnoreInvalidProcID, start); 3023 } 3024 break; // don't proliferate warnings for large count 3025 } else { 3026 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3027 start++; 3028 (*setSize)++; 3029 } 3030 } 3031 if (**scan == '}') { 3032 break; 3033 } 3034 (*scan)++; // skip ',' 3035 continue; 3036 } 3037 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3038 (*scan)++; // skip ':' 3039 3040 // Read stride parameter 3041 int sign = +1; 3042 for (;;) { 3043 SKIP_WS(*scan); 3044 if (**scan == '+') { 3045 (*scan)++; // skip '+' 3046 continue; 3047 } 3048 if (**scan == '-') { 3049 sign *= -1; 3050 (*scan)++; // skip '-' 3051 continue; 3052 } 3053 break; 3054 } 3055 SKIP_WS(*scan); 3056 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3057 next = *scan; 3058 SKIP_DIGITS(next); 3059 stride = __kmp_str_to_int(*scan, *next); 3060 KMP_ASSERT(stride >= 0); 3061 *scan = next; 3062 stride *= sign; 3063 3064 // valid follow sets are ',' and '}' 3065 SKIP_WS(*scan); 3066 if (**scan == '}' || **scan == ',') { 3067 for (i = 0; i < count; i++) { 3068 if ((start > maxOsId) || 3069 
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3070 if (__kmp_affinity_verbose || 3071 (__kmp_affinity_warnings && 3072 (__kmp_affinity_type != affinity_none))) { 3073 KMP_WARNING(AffIgnoreInvalidProcID, start); 3074 } 3075 break; // don't proliferate warnings for large count 3076 } else { 3077 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3078 start += stride; 3079 (*setSize)++; 3080 } 3081 } 3082 if (**scan == '}') { 3083 break; 3084 } 3085 (*scan)++; // skip ',' 3086 continue; 3087 } 3088 3089 KMP_ASSERT2(0, "bad explicit places list"); 3090 } 3091 } 3092 3093 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3094 int maxOsId, kmp_affin_mask_t *tempMask, 3095 int *setSize) { 3096 const char *next; 3097 3098 // valid follow sets are '{' '!' and num 3099 SKIP_WS(*scan); 3100 if (**scan == '{') { 3101 (*scan)++; // skip '{' 3102 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3103 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3104 (*scan)++; // skip '}' 3105 } else if (**scan == '!') { 3106 (*scan)++; // skip '!' 3107 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3108 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3109 } else if ((**scan >= '0') && (**scan <= '9')) { 3110 next = *scan; 3111 SKIP_DIGITS(next); 3112 int num = __kmp_str_to_int(*scan, *next); 3113 KMP_ASSERT(num >= 0); 3114 if ((num > maxOsId) || 3115 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3116 if (__kmp_affinity_verbose || 3117 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3118 KMP_WARNING(AffIgnoreInvalidProcID, num); 3119 } 3120 } else { 3121 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3122 (*setSize)++; 3123 } 3124 *scan = next; // skip num 3125 } else { 3126 KMP_ASSERT2(0, "bad explicit places list"); 3127 } 3128 } 3129 3130 // static void 3131 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3132 unsigned int *out_numMasks, 3133 const char *placelist, 3134 kmp_affin_mask_t *osId2Mask, 3135 int maxOsId) { 3136 int i, j, count, stride, sign; 3137 const char *scan = placelist; 3138 const char *next = placelist; 3139 3140 numNewMasks = 2; 3141 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3142 nextNewMask = 0; 3143 3144 // tempMask is modified based on the previous or initial 3145 // place to form the current place 3146 // previousMask contains the previous place 3147 kmp_affin_mask_t *tempMask; 3148 kmp_affin_mask_t *previousMask; 3149 KMP_CPU_ALLOC(tempMask); 3150 KMP_CPU_ZERO(tempMask); 3151 KMP_CPU_ALLOC(previousMask); 3152 KMP_CPU_ZERO(previousMask); 3153 int setSize = 0; 3154 3155 for (;;) { 3156 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3157 3158 // valid follow sets are ',' ':' and EOL 3159 SKIP_WS(scan); 3160 if (*scan == '\0' || *scan == ',') { 3161 if (setSize > 0) { 3162 ADD_MASK(tempMask); 3163 } 3164 KMP_CPU_ZERO(tempMask); 3165 setSize = 0; 3166 if (*scan == '\0') { 3167 break; 3168 } 3169 scan++; // skip ',' 3170 continue; 3171 } 3172 3173 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3174 scan++; // skip ':' 3175 3176 // Read count parameter 3177 SKIP_WS(scan); 3178 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3179 next = scan; 3180 SKIP_DIGITS(next); 3181 count = __kmp_str_to_int(scan, *next); 3182 KMP_ASSERT(count >= 0); 3183 scan = next; 3184 3185 // valid follow sets are ',' ':' and EOL 3186 SKIP_WS(scan); 3187 if (*scan == '\0' || *scan == ',') { 3188 stride = 
+1; 3189 } else { 3190 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3191 scan++; // skip ':' 3192 3193 // Read stride parameter 3194 sign = +1; 3195 for (;;) { 3196 SKIP_WS(scan); 3197 if (*scan == '+') { 3198 scan++; // skip '+' 3199 continue; 3200 } 3201 if (*scan == '-') { 3202 sign *= -1; 3203 scan++; // skip '-' 3204 continue; 3205 } 3206 break; 3207 } 3208 SKIP_WS(scan); 3209 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3210 next = scan; 3211 SKIP_DIGITS(next); 3212 stride = __kmp_str_to_int(scan, *next); 3213 KMP_DEBUG_ASSERT(stride >= 0); 3214 scan = next; 3215 stride *= sign; 3216 } 3217 3218 // Add places determined by initial_place : count : stride 3219 for (i = 0; i < count; i++) { 3220 if (setSize == 0) { 3221 break; 3222 } 3223 // Add the current place, then build the next place (tempMask) from that 3224 KMP_CPU_COPY(previousMask, tempMask); 3225 ADD_MASK(previousMask); 3226 KMP_CPU_ZERO(tempMask); 3227 setSize = 0; 3228 KMP_CPU_SET_ITERATE(j, previousMask) { 3229 if (!KMP_CPU_ISSET(j, previousMask)) { 3230 continue; 3231 } 3232 if ((j + stride > maxOsId) || (j + stride < 0) || 3233 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3234 (!KMP_CPU_ISSET(j + stride, 3235 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3236 if ((__kmp_affinity_verbose || 3237 (__kmp_affinity_warnings && 3238 (__kmp_affinity_type != affinity_none))) && 3239 i < count - 1) { 3240 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3241 } 3242 continue; 3243 } 3244 KMP_CPU_SET(j + stride, tempMask); 3245 setSize++; 3246 } 3247 } 3248 KMP_CPU_ZERO(tempMask); 3249 setSize = 0; 3250 3251 // valid follow sets are ',' and EOL 3252 SKIP_WS(scan); 3253 if (*scan == '\0') { 3254 break; 3255 } 3256 if (*scan == ',') { 3257 scan++; // skip ',' 3258 continue; 3259 } 3260 3261 KMP_ASSERT2(0, "bad explicit places list"); 3262 } 3263 3264 *out_numMasks = nextNewMask; 3265 if (nextNewMask == 0) { 3266 *out_masks = NULL; 3267 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3268 return; 3269 } 3270 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3271 KMP_CPU_FREE(tempMask); 3272 KMP_CPU_FREE(previousMask); 3273 for (i = 0; i < nextNewMask; i++) { 3274 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3275 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3276 KMP_CPU_COPY(dest, src); 3277 } 3278 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3279 } 3280 3281 #endif /* OMP_40_ENABLED */ 3282 3283 #undef ADD_MASK 3284 #undef ADD_MASK_OSID 3285 3286 #if KMP_USE_HWLOC 3287 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3288 // skip PUs descendants of the object o 3289 int skipped = 0; 3290 hwloc_obj_t hT = NULL; 3291 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3292 for (int i = 0; i < N; ++i) { 3293 KMP_DEBUG_ASSERT(hT); 3294 unsigned idx = hT->os_index; 3295 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3296 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3297 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3298 ++skipped; 3299 } 3300 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3301 } 3302 return skipped; // count number of skipped units 3303 } 3304 3305 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3306 // check if obj has PUs present in fullMask 3307 hwloc_obj_t hT = NULL; 3308 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3309 for (int i = 0; i < N; ++i) { 3310 KMP_DEBUG_ASSERT(hT); 3311 unsigned idx = hT->os_index; 3312 if (KMP_CPU_ISSET(idx, 
__kmp_affin_fullMask)) 3313 return 1; // found PU 3314 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3315 } 3316 return 0; // no PUs found 3317 } 3318 #endif // KMP_USE_HWLOC 3319 3320 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3321 AddrUnsPair *newAddr; 3322 if (__kmp_hws_requested == 0) 3323 goto _exit; // no topology limiting actions requested, exit 3324 #if KMP_USE_HWLOC 3325 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3326 // Number of subobjects calculated dynamically, this works fine for 3327 // any non-uniform topology. 3328 // L2 cache objects are determined by depth, other objects - by type. 3329 hwloc_topology_t tp = __kmp_hwloc_topology; 3330 int nS = 0, nN = 0, nL = 0, nC = 0, 3331 nT = 0; // logical index including skipped 3332 int nCr = 0, nTr = 0; // number of requested units 3333 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters 3334 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) 3335 int L2depth, idx; 3336 3337 // check support of extensions ---------------------------------- 3338 int numa_support = 0, tile_support = 0; 3339 if (__kmp_pu_os_idx) 3340 hT = hwloc_get_pu_obj_by_os_index(tp, 3341 __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3342 else 3343 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3344 if (hT == NULL) { // something's gone wrong 3345 KMP_WARNING(AffHWSubsetUnsupported); 3346 goto _exit; 3347 } 3348 // check NUMA node 3349 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3350 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3351 if (hN != NULL && hN->depth > hS->depth) { 3352 numa_support = 1; // 1 in case socket includes node(s) 3353 } else if (__kmp_hws_node.num > 0) { 3354 // don't support sockets inside NUMA node (no such HW found for testing) 3355 KMP_WARNING(AffHWSubsetUnsupported); 3356 goto _exit; 3357 } 3358 // check L2 cahce, get object by depth because of multiple caches 3359 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3360 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3361 if (hL != NULL && 3362 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { 3363 tile_support = 1; // no sense to count L2 if it includes single core 3364 } else if (__kmp_hws_tile.num > 0) { 3365 if (__kmp_hws_core.num == 0) { 3366 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3367 __kmp_hws_tile.num = 0; 3368 } else { 3369 // L2 and core are both requested, but represent same object 3370 KMP_WARNING(AffHWSubsetInvalid); 3371 goto _exit; 3372 } 3373 } 3374 // end of check of extensions ----------------------------------- 3375 3376 // fill in unset items, validate settings ----------------------- 3377 if (__kmp_hws_socket.num == 0) 3378 __kmp_hws_socket.num = nPackages; // use all available sockets 3379 if (__kmp_hws_socket.offset >= nPackages) { 3380 KMP_WARNING(AffHWSubsetManySockets); 3381 goto _exit; 3382 } 3383 if (numa_support) { 3384 hN = NULL; 3385 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3386 &hN); // num nodes in socket 3387 if (__kmp_hws_node.num == 0) 3388 __kmp_hws_node.num = NN; // use all available nodes 3389 if (__kmp_hws_node.offset >= NN) { 3390 KMP_WARNING(AffHWSubsetManyNodes); 3391 goto _exit; 3392 } 3393 if (tile_support) { 3394 // get num tiles in node 3395 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3396 if (__kmp_hws_tile.num == 0) { 3397 __kmp_hws_tile.num = NL + 1; 3398 } // use all 
available tiles, some node may have more tiles, thus +1 3399 if (__kmp_hws_tile.offset >= NL) { 3400 KMP_WARNING(AffHWSubsetManyTiles); 3401 goto _exit; 3402 } 3403 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3404 &hC); // num cores in tile 3405 if (__kmp_hws_core.num == 0) 3406 __kmp_hws_core.num = NC; // use all available cores 3407 if (__kmp_hws_core.offset >= NC) { 3408 KMP_WARNING(AffHWSubsetManyCores); 3409 goto _exit; 3410 } 3411 } else { // tile_support 3412 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3413 &hC); // num cores in node 3414 if (__kmp_hws_core.num == 0) 3415 __kmp_hws_core.num = NC; // use all available cores 3416 if (__kmp_hws_core.offset >= NC) { 3417 KMP_WARNING(AffHWSubsetManyCores); 3418 goto _exit; 3419 } 3420 } // tile_support 3421 } else { // numa_support 3422 if (tile_support) { 3423 // get num tiles in socket 3424 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3425 if (__kmp_hws_tile.num == 0) 3426 __kmp_hws_tile.num = NL; // use all available tiles 3427 if (__kmp_hws_tile.offset >= NL) { 3428 KMP_WARNING(AffHWSubsetManyTiles); 3429 goto _exit; 3430 } 3431 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3432 &hC); // num cores in tile 3433 if (__kmp_hws_core.num == 0) 3434 __kmp_hws_core.num = NC; // use all available cores 3435 if (__kmp_hws_core.offset >= NC) { 3436 KMP_WARNING(AffHWSubsetManyCores); 3437 goto _exit; 3438 } 3439 } else { // tile_support 3440 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3441 &hC); // num cores in socket 3442 if (__kmp_hws_core.num == 0) 3443 __kmp_hws_core.num = NC; // use all available cores 3444 if (__kmp_hws_core.offset >= NC) { 3445 KMP_WARNING(AffHWSubsetManyCores); 3446 goto _exit; 3447 } 3448 } // tile_support 3449 } 3450 if (__kmp_hws_proc.num == 0) 3451 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3452 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3453 KMP_WARNING(AffHWSubsetManyProcs); 3454 goto _exit; 3455 } 3456 // end of validation -------------------------------------------- 3457 3458 if (pAddr) // pAddr is NULL in case of affinity_none 3459 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3460 __kmp_avail_proc); // max size 3461 // main loop to form HW subset ---------------------------------- 3462 hS = NULL; 3463 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3464 for (int s = 0; s < NP; ++s) { 3465 // Check Socket ----------------------------------------------- 3466 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3467 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3468 continue; // skip socket if all PUs are out of fullMask 3469 ++nS; // only count objects those have PUs in affinity mask 3470 if (nS <= __kmp_hws_socket.offset || 3471 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3472 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3473 continue; // move to next socket 3474 } 3475 nCr = 0; // count number of cores per socket 3476 // socket requested, go down the topology tree 3477 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3478 if (numa_support) { 3479 nN = 0; 3480 hN = NULL; 3481 // num nodes in current socket 3482 int NN = 3483 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3484 for (int n = 0; n < NN; ++n) { 3485 // Check NUMA Node ---------------------------------------- 3486 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3487 hN = hwloc_get_next_obj_by_type(tp, 
HWLOC_OBJ_NUMANODE, hN); 3488 continue; // skip node if all PUs are out of fullMask 3489 } 3490 ++nN; 3491 if (nN <= __kmp_hws_node.offset || 3492 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3493 // skip node as not requested 3494 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node 3495 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3496 continue; // move to next node 3497 } 3498 // node requested, go down the topology tree 3499 if (tile_support) { 3500 nL = 0; 3501 hL = NULL; 3502 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3503 for (int l = 0; l < NL; ++l) { 3504 // Check L2 (tile) ------------------------------------ 3505 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3506 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3507 continue; // skip tile if all PUs are out of fullMask 3508 } 3509 ++nL; 3510 if (nL <= __kmp_hws_tile.offset || 3511 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3512 // skip tile as not requested 3513 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3514 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3515 continue; // move to next tile 3516 } 3517 // tile requested, go down the topology tree 3518 nC = 0; 3519 hC = NULL; 3520 // num cores in current tile 3521 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3522 HWLOC_OBJ_CORE, &hC); 3523 for (int c = 0; c < NC; ++c) { 3524 // Check Core --------------------------------------- 3525 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3526 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3527 continue; // skip core if all PUs are out of fullMask 3528 } 3529 ++nC; 3530 if (nC <= __kmp_hws_core.offset || 3531 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3532 // skip node as not requested 3533 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3534 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3535 continue; // move to next node 3536 } 3537 // core requested, go down to PUs 3538 nT = 0; 3539 nTr = 0; 3540 hT = NULL; 3541 // num procs in current core 3542 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3543 HWLOC_OBJ_PU, &hT); 3544 for (int t = 0; t < NT; ++t) { 3545 // Check PU --------------------------------------- 3546 idx = hT->os_index; 3547 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3548 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3549 continue; // skip PU if not in fullMask 3550 } 3551 ++nT; 3552 if (nT <= __kmp_hws_proc.offset || 3553 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3554 // skip PU 3555 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3556 ++n_old; 3557 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3558 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3559 continue; // move to next node 3560 } 3561 ++nTr; 3562 if (pAddr) // collect requested thread's data 3563 newAddr[n_new] = (*pAddr)[n_old]; 3564 ++n_new; 3565 ++n_old; 3566 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3567 } // threads loop 3568 if (nTr > 0) { 3569 ++nCr; // num cores per socket 3570 ++nCo; // total num cores 3571 if (nTr > nTpC) 3572 nTpC = nTr; // calc max threads per core 3573 } 3574 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3575 } // cores loop 3576 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3577 } // tiles loop 3578 } else { // tile_support 3579 // no tiles, check cores 3580 nC = 0; 3581 hC = NULL; 3582 // num cores in current node 3583 int NC = 3584 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3585 for (int c = 0; c < NC; ++c) { 3586 // Check Core 
--------------------------------------- 3587 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3588 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3589 continue; // skip core if all PUs are out of fullMask 3590 } 3591 ++nC; 3592 if (nC <= __kmp_hws_core.offset || 3593 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3594 // skip node as not requested 3595 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3596 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3597 continue; // move to next node 3598 } 3599 // core requested, go down to PUs 3600 nT = 0; 3601 nTr = 0; 3602 hT = NULL; 3603 int NT = 3604 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3605 for (int t = 0; t < NT; ++t) { 3606 // Check PU --------------------------------------- 3607 idx = hT->os_index; 3608 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3609 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3610 continue; // skip PU if not in fullMask 3611 } 3612 ++nT; 3613 if (nT <= __kmp_hws_proc.offset || 3614 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3615 // skip PU 3616 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3617 ++n_old; 3618 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3619 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3620 continue; // move to next node 3621 } 3622 ++nTr; 3623 if (pAddr) // collect requested thread's data 3624 newAddr[n_new] = (*pAddr)[n_old]; 3625 ++n_new; 3626 ++n_old; 3627 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3628 } // threads loop 3629 if (nTr > 0) { 3630 ++nCr; // num cores per socket 3631 ++nCo; // total num cores 3632 if (nTr > nTpC) 3633 nTpC = nTr; // calc max threads per core 3634 } 3635 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3636 } // cores loop 3637 } // tiles support 3638 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3639 } // nodes loop 3640 } else { // numa_support 3641 // no NUMA support 3642 if (tile_support) { 3643 nL = 0; 3644 hL = NULL; 3645 // num tiles in current socket 3646 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3647 for (int l = 0; l < NL; ++l) { 3648 // Check L2 (tile) ------------------------------------ 3649 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3650 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3651 continue; // skip tile if all PUs are out of fullMask 3652 } 3653 ++nL; 3654 if (nL <= __kmp_hws_tile.offset || 3655 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3656 // skip tile as not requested 3657 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3658 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3659 continue; // move to next tile 3660 } 3661 // tile requested, go down the topology tree 3662 nC = 0; 3663 hC = NULL; 3664 // num cores per tile 3665 int NC = 3666 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); 3667 for (int c = 0; c < NC; ++c) { 3668 // Check Core --------------------------------------- 3669 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3670 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3671 continue; // skip core if all PUs are out of fullMask 3672 } 3673 ++nC; 3674 if (nC <= __kmp_hws_core.offset || 3675 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3676 // skip node as not requested 3677 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3678 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3679 continue; // move to next node 3680 } 3681 // core requested, go down to PUs 3682 nT = 0; 3683 nTr = 0; 3684 hT = NULL; 3685 // num procs per core 3686 int NT 
= 3687 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3688 for (int t = 0; t < NT; ++t) { 3689 // Check PU --------------------------------------- 3690 idx = hT->os_index; 3691 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3692 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3693 continue; // skip PU if not in fullMask 3694 } 3695 ++nT; 3696 if (nT <= __kmp_hws_proc.offset || 3697 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3698 // skip PU 3699 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3700 ++n_old; 3701 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3702 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3703 continue; // move to next node 3704 } 3705 ++nTr; 3706 if (pAddr) // collect requested thread's data 3707 newAddr[n_new] = (*pAddr)[n_old]; 3708 ++n_new; 3709 ++n_old; 3710 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3711 } // threads loop 3712 if (nTr > 0) { 3713 ++nCr; // num cores per socket 3714 ++nCo; // total num cores 3715 if (nTr > nTpC) 3716 nTpC = nTr; // calc max threads per core 3717 } 3718 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3719 } // cores loop 3720 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3721 } // tiles loop 3722 } else { // tile_support 3723 // no tiles, check cores 3724 nC = 0; 3725 hC = NULL; 3726 // num cores in socket 3727 int NC = 3728 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3729 for (int c = 0; c < NC; ++c) { 3730 // Check Core ------------------------------------------- 3731 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3732 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3733 continue; // skip core if all PUs are out of fullMask 3734 } 3735 ++nC; 3736 if (nC <= __kmp_hws_core.offset || 3737 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3738 // skip node as not requested 3739 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3740 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3741 continue; // move to next node 3742 } 3743 // core requested, go down to PUs 3744 nT = 0; 3745 nTr = 0; 3746 hT = NULL; 3747 // num procs per core 3748 int NT = 3749 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3750 for (int t = 0; t < NT; ++t) { 3751 // Check PU --------------------------------------- 3752 idx = hT->os_index; 3753 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3754 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3755 continue; // skip PU if not in fullMask 3756 } 3757 ++nT; 3758 if (nT <= __kmp_hws_proc.offset || 3759 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3760 // skip PU 3761 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3762 ++n_old; 3763 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3764 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3765 continue; // move to next node 3766 } 3767 ++nTr; 3768 if (pAddr) // collect requested thread's data 3769 newAddr[n_new] = (*pAddr)[n_old]; 3770 ++n_new; 3771 ++n_old; 3772 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3773 } // threads loop 3774 if (nTr > 0) { 3775 ++nCr; // num cores per socket 3776 ++nCo; // total num cores 3777 if (nTr > nTpC) 3778 nTpC = nTr; // calc max threads per core 3779 } 3780 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3781 } // cores loop 3782 } // tiles support 3783 } // numa_support 3784 if (nCr > 0) { // found cores? 
3785 ++nPkg; // num sockets 3786 if (nCr > nCpP) 3787 nCpP = nCr; // calc max cores per socket 3788 } 3789 } // sockets loop 3790 3791 // check the subset is valid 3792 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3793 KMP_DEBUG_ASSERT(nPkg > 0); 3794 KMP_DEBUG_ASSERT(nCpP > 0); 3795 KMP_DEBUG_ASSERT(nTpC > 0); 3796 KMP_DEBUG_ASSERT(nCo > 0); 3797 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3798 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3799 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3800 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3801 3802 nPackages = nPkg; // correct num sockets 3803 nCoresPerPkg = nCpP; // correct num cores per socket 3804 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3805 __kmp_avail_proc = n_new; // correct num procs 3806 __kmp_ncores = nCo; // correct num cores 3807 // hwloc topology method end 3808 } else 3809 #endif // KMP_USE_HWLOC 3810 { 3811 int n_old = 0, n_new = 0, proc_num = 0; 3812 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3813 KMP_WARNING(AffHWSubsetNoHWLOC); 3814 goto _exit; 3815 } 3816 if (__kmp_hws_socket.num == 0) 3817 __kmp_hws_socket.num = nPackages; // use all available sockets 3818 if (__kmp_hws_core.num == 0) 3819 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3820 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3821 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3822 if (!__kmp_affinity_uniform_topology()) { 3823 KMP_WARNING(AffHWSubsetNonUniform); 3824 goto _exit; // don't support non-uniform topology 3825 } 3826 if (depth > 3) { 3827 KMP_WARNING(AffHWSubsetNonThreeLevel); 3828 goto _exit; // don't support not-3-level topology 3829 } 3830 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3831 KMP_WARNING(AffHWSubsetManySockets); 3832 goto _exit; 3833 } 3834 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 3835 KMP_WARNING(AffHWSubsetManyCores); 3836 goto _exit; 3837 } 3838 // Form the requested subset 3839 if (pAddr) // pAddr is NULL in case of affinity_none 3840 newAddr = (AddrUnsPair *)__kmp_allocate( 3841 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * 3842 __kmp_hws_proc.num); 3843 for (int i = 0; i < nPackages; ++i) { 3844 if (i < __kmp_hws_socket.offset || 3845 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3846 // skip not-requested socket 3847 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3848 if (__kmp_pu_os_idx != NULL) { 3849 // walk through skipped socket 3850 for (int j = 0; j < nCoresPerPkg; ++j) { 3851 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3852 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3853 ++proc_num; 3854 } 3855 } 3856 } 3857 } else { 3858 // walk through requested socket 3859 for (int j = 0; j < nCoresPerPkg; ++j) { 3860 if (j < __kmp_hws_core.offset || 3861 j >= __kmp_hws_core.offset + 3862 __kmp_hws_core.num) { // skip not-requested core 3863 n_old += __kmp_nThreadsPerCore; 3864 if (__kmp_pu_os_idx != NULL) { 3865 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3866 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3867 ++proc_num; 3868 } 3869 } 3870 } else { 3871 // walk through requested core 3872 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3873 if (k < __kmp_hws_proc.num) { 3874 if (pAddr) // collect requested thread's data 3875 newAddr[n_new] = (*pAddr)[n_old]; 3876 n_new++; 3877 } else { 3878 if (__kmp_pu_os_idx != NULL) 3879 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3880 } 3881 n_old++; 3882 ++proc_num; 3883 
} 3884 } 3885 } 3886 } 3887 } 3888 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3889 KMP_DEBUG_ASSERT(n_new == 3890 __kmp_hws_socket.num * __kmp_hws_core.num * 3891 __kmp_hws_proc.num); 3892 nPackages = __kmp_hws_socket.num; // correct nPackages 3893 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3894 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3895 __kmp_avail_proc = n_new; // correct avail_proc 3896 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3897 } // non-hwloc topology method 3898 if (pAddr) { 3899 __kmp_free(*pAddr); 3900 *pAddr = newAddr; // replace old topology with new one 3901 } 3902 if (__kmp_affinity_verbose) { 3903 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3904 __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, 3905 __kmp_affin_fullMask); 3906 if (__kmp_affinity_respect_mask) { 3907 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3908 } else { 3909 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3910 } 3911 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3912 kmp_str_buf_t buf; 3913 __kmp_str_buf_init(&buf); 3914 __kmp_str_buf_print(&buf, "%d", nPackages); 3915 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3916 __kmp_nThreadsPerCore, __kmp_ncores); 3917 __kmp_str_buf_free(&buf); 3918 } 3919 _exit: 3920 if (__kmp_pu_os_idx != NULL) { 3921 __kmp_free(__kmp_pu_os_idx); 3922 __kmp_pu_os_idx = NULL; 3923 } 3924 } 3925 3926 // This function figures out the deepest level at which there is at least one 3927 // cluster/core with more than one processing unit bound to it. 3928 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3929 int nprocs, int bottom_level) { 3930 int core_level = 0; 3931 3932 for (int i = 0; i < nprocs; i++) { 3933 for (int j = bottom_level; j > 0; j--) { 3934 if (address2os[i].first.labels[j] > 0) { 3935 if (core_level < (j - 1)) { 3936 core_level = j - 1; 3937 } 3938 } 3939 } 3940 } 3941 return core_level; 3942 } 3943 3944 // This function counts number of clusters/cores at given level. 3945 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3946 int nprocs, int bottom_level, 3947 int core_level) { 3948 int ncores = 0; 3949 int i, j; 3950 3951 j = bottom_level; 3952 for (i = 0; i < nprocs; i++) { 3953 for (j = bottom_level; j > core_level; j--) { 3954 if ((i + 1) < nprocs) { 3955 if (address2os[i + 1].first.labels[j] > 0) { 3956 break; 3957 } 3958 } 3959 } 3960 if (j == core_level) { 3961 ncores++; 3962 } 3963 } 3964 if (j > core_level) { 3965 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 3966 // core. May occur when called from __kmp_affinity_find_core(). 3967 ncores++; 3968 } 3969 return ncores; 3970 } 3971 3972 // This function finds to which cluster/core given processing unit is bound. 3973 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, 3974 int bottom_level, int core_level) { 3975 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, 3976 core_level) - 3977 1; 3978 } 3979 3980 // This function finds maximal number of processing units bound to a 3981 // cluster/core at given level. 
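// For example, on a machine where every core at that level runs two hardware
// threads, it returns 2 (and it falls back to 1 when core_level is already
// the bottom level).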
3982 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 3983 int nprocs, int bottom_level, 3984 int core_level) { 3985 int maxprocpercore = 0; 3986 3987 if (core_level < bottom_level) { 3988 for (int i = 0; i < nprocs; i++) { 3989 int percore = address2os[i].first.labels[core_level + 1] + 1; 3990 3991 if (percore > maxprocpercore) { 3992 maxprocpercore = percore; 3993 } 3994 } 3995 } else { 3996 maxprocpercore = 1; 3997 } 3998 return maxprocpercore; 3999 } 4000 4001 static AddrUnsPair *address2os = NULL; 4002 static int *procarr = NULL; 4003 static int __kmp_aff_depth = 0; 4004 4005 #if KMP_USE_HIER_SCHED 4006 #define KMP_EXIT_AFF_NONE \ 4007 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4008 KMP_ASSERT(address2os == NULL); \ 4009 __kmp_apply_thread_places(NULL, 0); \ 4010 __kmp_create_affinity_none_places(); \ 4011 __kmp_dispatch_set_hierarchy_values(); \ 4012 return; 4013 #else 4014 #define KMP_EXIT_AFF_NONE \ 4015 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4016 KMP_ASSERT(address2os == NULL); \ 4017 __kmp_apply_thread_places(NULL, 0); \ 4018 __kmp_create_affinity_none_places(); \ 4019 return; 4020 #endif 4021 4022 // Create a one element mask array (set of places) which only contains the 4023 // initial process's affinity mask 4024 static void __kmp_create_affinity_none_places() { 4025 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4026 KMP_ASSERT(__kmp_affinity_type == affinity_none); 4027 __kmp_affinity_num_masks = 1; 4028 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4029 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 4030 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 4031 } 4032 4033 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 4034 const Address *aa = &(((const AddrUnsPair *)a)->first); 4035 const Address *bb = &(((const AddrUnsPair *)b)->first); 4036 unsigned depth = aa->depth; 4037 unsigned i; 4038 KMP_DEBUG_ASSERT(depth == bb->depth); 4039 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 4040 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 4041 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 4042 int j = depth - i - 1; 4043 if (aa->childNums[j] < bb->childNums[j]) 4044 return -1; 4045 if (aa->childNums[j] > bb->childNums[j]) 4046 return 1; 4047 } 4048 for (; i < depth; i++) { 4049 int j = i - __kmp_affinity_compact; 4050 if (aa->childNums[j] < bb->childNums[j]) 4051 return -1; 4052 if (aa->childNums[j] > bb->childNums[j]) 4053 return 1; 4054 } 4055 return 0; 4056 } 4057 4058 static void __kmp_aux_affinity_initialize(void) { 4059 if (__kmp_affinity_masks != NULL) { 4060 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4061 return; 4062 } 4063 4064 // Create the "full" mask - this defines all of the processors that we 4065 // consider to be in the machine model. If respect is set, then it is the 4066 // initialization thread's affinity mask. Otherwise, it is all processors that 4067 // we know about on the machine. 4068 if (__kmp_affin_fullMask == NULL) { 4069 KMP_CPU_ALLOC(__kmp_affin_fullMask); 4070 } 4071 if (KMP_AFFINITY_CAPABLE()) { 4072 if (__kmp_affinity_respect_mask) { 4073 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 4074 4075 // Count the number of available processors. 
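// (If the inherited mask somehow reports more processors than __kmp_xproc,
// the topology information cannot be trusted, so the code below warns and
// falls back to affinity_none.)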
4076 unsigned i; 4077 __kmp_avail_proc = 0; 4078 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 4079 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 4080 continue; 4081 } 4082 __kmp_avail_proc++; 4083 } 4084 if (__kmp_avail_proc > __kmp_xproc) { 4085 if (__kmp_affinity_verbose || 4086 (__kmp_affinity_warnings && 4087 (__kmp_affinity_type != affinity_none))) { 4088 KMP_WARNING(ErrorInitializeAffinity); 4089 } 4090 __kmp_affinity_type = affinity_none; 4091 KMP_AFFINITY_DISABLE(); 4092 return; 4093 } 4094 } else { 4095 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 4096 __kmp_avail_proc = __kmp_xproc; 4097 } 4098 } 4099 4100 if (__kmp_affinity_gran == affinity_gran_tile && 4101 // check if user's request is valid 4102 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { 4103 KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); 4104 __kmp_affinity_gran = affinity_gran_package; 4105 } 4106 4107 int depth = -1; 4108 kmp_i18n_id_t msg_id = kmp_i18n_null; 4109 4110 // For backward compatibility, setting KMP_CPUINFO_FILE => 4111 // KMP_TOPOLOGY_METHOD=cpuinfo 4112 if ((__kmp_cpuinfo_file != NULL) && 4113 (__kmp_affinity_top_method == affinity_top_method_all)) { 4114 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 4115 } 4116 4117 if (__kmp_affinity_top_method == affinity_top_method_all) { 4118 // In the default code path, errors are not fatal - we just try using 4119 // another method. We only emit a warning message if affinity is on, or the 4120 // verbose flag is set, an the nowarnings flag was not set. 4121 const char *file_name = NULL; 4122 int line = 0; 4123 #if KMP_USE_HWLOC 4124 if (depth < 0 && 4125 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4126 if (__kmp_affinity_verbose) { 4127 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4128 } 4129 if (!__kmp_hwloc_error) { 4130 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4131 if (depth == 0) { 4132 KMP_EXIT_AFF_NONE; 4133 } else if (depth < 0 && __kmp_affinity_verbose) { 4134 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4135 } 4136 } else if (__kmp_affinity_verbose) { 4137 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4138 } 4139 } 4140 #endif 4141 4142 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4143 4144 if (depth < 0) { 4145 if (__kmp_affinity_verbose) { 4146 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4147 } 4148 4149 file_name = NULL; 4150 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4151 if (depth == 0) { 4152 KMP_EXIT_AFF_NONE; 4153 } 4154 4155 if (depth < 0) { 4156 if (__kmp_affinity_verbose) { 4157 if (msg_id != kmp_i18n_null) { 4158 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", 4159 __kmp_i18n_catgets(msg_id), 4160 KMP_I18N_STR(DecodingLegacyAPIC)); 4161 } else { 4162 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4163 KMP_I18N_STR(DecodingLegacyAPIC)); 4164 } 4165 } 4166 4167 file_name = NULL; 4168 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4169 if (depth == 0) { 4170 KMP_EXIT_AFF_NONE; 4171 } 4172 } 4173 } 4174 4175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4176 4177 #if KMP_OS_LINUX 4178 4179 if (depth < 0) { 4180 if (__kmp_affinity_verbose) { 4181 if (msg_id != kmp_i18n_null) { 4182 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", 4183 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4184 } else { 4185 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4186 } 4187 } 4188 4189 FILE *f = fopen("/proc/cpuinfo", "r"); 4190 if (f == NULL) { 4191 msg_id = kmp_i18n_str_CantOpenCpuinfo; 4192 } else { 4193 file_name = 
"/proc/cpuinfo"; 4194 depth = 4195 __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4196 fclose(f); 4197 if (depth == 0) { 4198 KMP_EXIT_AFF_NONE; 4199 } 4200 } 4201 } 4202 4203 #endif /* KMP_OS_LINUX */ 4204 4205 #if KMP_GROUP_AFFINITY 4206 4207 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4208 if (__kmp_affinity_verbose) { 4209 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4210 } 4211 4212 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4213 KMP_ASSERT(depth != 0); 4214 } 4215 4216 #endif /* KMP_GROUP_AFFINITY */ 4217 4218 if (depth < 0) { 4219 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4220 if (file_name == NULL) { 4221 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4222 } else if (line == 0) { 4223 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4224 } else { 4225 KMP_INFORM(UsingFlatOSFileLine, file_name, line, 4226 __kmp_i18n_catgets(msg_id)); 4227 } 4228 } 4229 // FIXME - print msg if msg_id = kmp_i18n_null ??? 4230 4231 file_name = ""; 4232 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4233 if (depth == 0) { 4234 KMP_EXIT_AFF_NONE; 4235 } 4236 KMP_ASSERT(depth > 0); 4237 KMP_ASSERT(address2os != NULL); 4238 } 4239 } 4240 4241 #if KMP_USE_HWLOC 4242 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4243 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4244 if (__kmp_affinity_verbose) { 4245 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4246 } 4247 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4248 if (depth == 0) { 4249 KMP_EXIT_AFF_NONE; 4250 } 4251 } 4252 #endif // KMP_USE_HWLOC 4253 4254 // If the user has specified that a paricular topology discovery method is to be 4255 // used, then we abort if that method fails. The exception is group affinity, 4256 // which might have been implicitly set. 
4257 4258 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4259 4260 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 4261 if (__kmp_affinity_verbose) { 4262 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4263 } 4264 4265 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4266 if (depth == 0) { 4267 KMP_EXIT_AFF_NONE; 4268 } 4269 if (depth < 0) { 4270 KMP_ASSERT(msg_id != kmp_i18n_null); 4271 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4272 } 4273 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4274 if (__kmp_affinity_verbose) { 4275 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4276 } 4277 4278 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4279 if (depth == 0) { 4280 KMP_EXIT_AFF_NONE; 4281 } 4282 if (depth < 0) { 4283 KMP_ASSERT(msg_id != kmp_i18n_null); 4284 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4285 } 4286 } 4287 4288 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4289 4290 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4291 const char *filename; 4292 if (__kmp_cpuinfo_file != NULL) { 4293 filename = __kmp_cpuinfo_file; 4294 } else { 4295 filename = "/proc/cpuinfo"; 4296 } 4297 4298 if (__kmp_affinity_verbose) { 4299 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4300 } 4301 4302 FILE *f = fopen(filename, "r"); 4303 if (f == NULL) { 4304 int code = errno; 4305 if (__kmp_cpuinfo_file != NULL) { 4306 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), 4307 KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null); 4308 } else { 4309 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), 4310 __kmp_msg_null); 4311 } 4312 } 4313 int line = 0; 4314 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4315 fclose(f); 4316 if (depth < 0) { 4317 KMP_ASSERT(msg_id != kmp_i18n_null); 4318 if (line > 0) { 4319 KMP_FATAL(FileLineMsgExiting, filename, line, 4320 __kmp_i18n_catgets(msg_id)); 4321 } else { 4322 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4323 } 4324 } 4325 if (__kmp_affinity_type == affinity_none) { 4326 KMP_ASSERT(depth == 0); 4327 KMP_EXIT_AFF_NONE; 4328 } 4329 } 4330 4331 #if KMP_GROUP_AFFINITY 4332 4333 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4334 if (__kmp_affinity_verbose) { 4335 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4336 } 4337 4338 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4339 KMP_ASSERT(depth != 0); 4340 if (depth < 0) { 4341 KMP_ASSERT(msg_id != kmp_i18n_null); 4342 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4343 } 4344 } 4345 4346 #endif /* KMP_GROUP_AFFINITY */ 4347 4348 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4349 if (__kmp_affinity_verbose) { 4350 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4351 } 4352 4353 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4354 if (depth == 0) { 4355 KMP_EXIT_AFF_NONE; 4356 } 4357 // should not fail 4358 KMP_ASSERT(depth > 0); 4359 KMP_ASSERT(address2os != NULL); 4360 } 4361 4362 #if KMP_USE_HIER_SCHED 4363 __kmp_dispatch_set_hierarchy_values(); 4364 #endif 4365 4366 if (address2os == NULL) { 4367 if (KMP_AFFINITY_CAPABLE() && 4368 (__kmp_affinity_verbose || 4369 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4370 KMP_WARNING(ErrorInitializeAffinity); 4371 } 4372 __kmp_affinity_type = affinity_none; 4373 __kmp_create_affinity_none_places(); 4374 KMP_AFFINITY_DISABLE(); 
4375 return; 4376 } 4377 4378 if (__kmp_affinity_gran == affinity_gran_tile 4379 #if KMP_USE_HWLOC 4380 && __kmp_tile_depth == 0 4381 #endif 4382 ) { 4383 // tiles requested but not detected, warn user on this 4384 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); 4385 } 4386 4387 __kmp_apply_thread_places(&address2os, depth); 4388 4389 // Create the table of masks, indexed by thread Id. 4390 unsigned maxIndex; 4391 unsigned numUnique; 4392 kmp_affin_mask_t *osId2Mask = 4393 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4394 if (__kmp_affinity_gran_levels == 0) { 4395 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4396 } 4397 4398 // Set the childNums vector in all Address objects. This must be done before 4399 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4400 // account the setting of __kmp_affinity_compact. 4401 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4402 4403 switch (__kmp_affinity_type) { 4404 4405 case affinity_explicit: 4406 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4407 #if OMP_40_ENABLED 4408 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4409 #endif 4410 { 4411 __kmp_affinity_process_proclist( 4412 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4413 __kmp_affinity_proclist, osId2Mask, maxIndex); 4414 } 4415 #if OMP_40_ENABLED 4416 else { 4417 __kmp_affinity_process_placelist( 4418 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4419 __kmp_affinity_proclist, osId2Mask, maxIndex); 4420 } 4421 #endif 4422 if (__kmp_affinity_num_masks == 0) { 4423 if (__kmp_affinity_verbose || 4424 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4425 KMP_WARNING(AffNoValidProcID); 4426 } 4427 __kmp_affinity_type = affinity_none; 4428 return; 4429 } 4430 break; 4431 4432 // The other affinity types rely on sorting the Addresses according to some 4433 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4434 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4435 // to do the sort and create the array of affinity masks. 
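// For illustration, with a depth-3 topology (package / core / thread):
//   __kmp_affinity_compact == 0 sorts entries by (package, core, thread)
//   child numbers, so OS procs sharing a core occupy consecutive places
//   ("compact" placement), while __kmp_affinity_compact == 2 sorts by
//   (thread, core, package), so consecutive places cycle across packages
//   first ("scatter", which is why the affinity_scatter case below inverts
//   the compact value).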
4436 4437 case affinity_logical: 4438 __kmp_affinity_compact = 0; 4439 if (__kmp_affinity_offset) { 4440 __kmp_affinity_offset = 4441 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4442 } 4443 goto sortAddresses; 4444 4445 case affinity_physical: 4446 if (__kmp_nThreadsPerCore > 1) { 4447 __kmp_affinity_compact = 1; 4448 if (__kmp_affinity_compact >= depth) { 4449 __kmp_affinity_compact = 0; 4450 } 4451 } else { 4452 __kmp_affinity_compact = 0; 4453 } 4454 if (__kmp_affinity_offset) { 4455 __kmp_affinity_offset = 4456 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4457 } 4458 goto sortAddresses; 4459 4460 case affinity_scatter: 4461 if (__kmp_affinity_compact >= depth) { 4462 __kmp_affinity_compact = 0; 4463 } else { 4464 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4465 } 4466 goto sortAddresses; 4467 4468 case affinity_compact: 4469 if (__kmp_affinity_compact >= depth) { 4470 __kmp_affinity_compact = depth - 1; 4471 } 4472 goto sortAddresses; 4473 4474 case affinity_balanced: 4475 if (depth <= 1) { 4476 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4477 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4478 } 4479 __kmp_affinity_type = affinity_none; 4480 return; 4481 } else if (__kmp_affinity_uniform_topology()) { 4482 break; 4483 } else { // Non-uniform topology 4484 4485 // Save the depth for further usage 4486 __kmp_aff_depth = depth; 4487 4488 int core_level = __kmp_affinity_find_core_level( 4489 address2os, __kmp_avail_proc, depth - 1); 4490 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4491 depth - 1, core_level); 4492 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4493 address2os, __kmp_avail_proc, depth - 1, core_level); 4494 4495 int nproc = ncores * maxprocpercore; 4496 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4497 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4498 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4499 } 4500 __kmp_affinity_type = affinity_none; 4501 return; 4502 } 4503 4504 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4505 for (int i = 0; i < nproc; i++) { 4506 procarr[i] = -1; 4507 } 4508 4509 int lastcore = -1; 4510 int inlastcore = 0; 4511 for (int i = 0; i < __kmp_avail_proc; i++) { 4512 int proc = address2os[i].second; 4513 int core = 4514 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4515 4516 if (core == lastcore) { 4517 inlastcore++; 4518 } else { 4519 inlastcore = 0; 4520 } 4521 lastcore = core; 4522 4523 procarr[core * maxprocpercore + inlastcore] = proc; 4524 } 4525 4526 break; 4527 } 4528 4529 sortAddresses: 4530 // Allocate the gtid->affinity mask table. 4531 if (__kmp_affinity_dups) { 4532 __kmp_affinity_num_masks = __kmp_avail_proc; 4533 } else { 4534 __kmp_affinity_num_masks = numUnique; 4535 } 4536 4537 #if OMP_40_ENABLED 4538 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4539 (__kmp_affinity_num_places > 0) && 4540 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4541 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4542 } 4543 #endif 4544 4545 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4546 4547 // Sort the address2os table according to the current setting of 4548 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
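// Note that when duplicates are not requested (__kmp_affinity_dups == 0),
// only the entries marked as group leaders (address2os[i].first.leader)
// contribute a mask, which is why __kmp_affinity_num_masks was set to
// numUnique above.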
4549 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4550 __kmp_affinity_cmp_Address_child_num); 4551 { 4552 int i; 4553 unsigned j; 4554 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4555 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { 4556 continue; 4557 } 4558 unsigned osId = address2os[i].second; 4559 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4560 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4561 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4562 KMP_CPU_COPY(dest, src); 4563 if (++j >= __kmp_affinity_num_masks) { 4564 break; 4565 } 4566 } 4567 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4568 } 4569 break; 4570 4571 default: 4572 KMP_ASSERT2(0, "Unexpected affinity setting"); 4573 } 4574 4575 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4576 machine_hierarchy.init(address2os, __kmp_avail_proc); 4577 } 4578 #undef KMP_EXIT_AFF_NONE 4579 4580 void __kmp_affinity_initialize(void) { 4581 // Much of the code above was written assumming that if a machine was not 4582 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4583 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4584 // There are too many checks for __kmp_affinity_type == affinity_none 4585 // in this code. Instead of trying to change them all, check if 4586 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4587 // affinity_none, call the real initialization routine, then restore 4588 // __kmp_affinity_type to affinity_disabled. 4589 int disabled = (__kmp_affinity_type == affinity_disabled); 4590 if (!KMP_AFFINITY_CAPABLE()) { 4591 KMP_ASSERT(disabled); 4592 } 4593 if (disabled) { 4594 __kmp_affinity_type = affinity_none; 4595 } 4596 __kmp_aux_affinity_initialize(); 4597 if (disabled) { 4598 __kmp_affinity_type = affinity_disabled; 4599 } 4600 } 4601 4602 void __kmp_affinity_uninitialize(void) { 4603 if (__kmp_affinity_masks != NULL) { 4604 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4605 __kmp_affinity_masks = NULL; 4606 } 4607 if (__kmp_affin_fullMask != NULL) { 4608 KMP_CPU_FREE(__kmp_affin_fullMask); 4609 __kmp_affin_fullMask = NULL; 4610 } 4611 __kmp_affinity_num_masks = 0; 4612 __kmp_affinity_type = affinity_default; 4613 #if OMP_40_ENABLED 4614 __kmp_affinity_num_places = 0; 4615 #endif 4616 if (__kmp_affinity_proclist != NULL) { 4617 __kmp_free(__kmp_affinity_proclist); 4618 __kmp_affinity_proclist = NULL; 4619 } 4620 if (address2os != NULL) { 4621 __kmp_free(address2os); 4622 address2os = NULL; 4623 } 4624 if (procarr != NULL) { 4625 __kmp_free(procarr); 4626 procarr = NULL; 4627 } 4628 #if KMP_USE_HWLOC 4629 if (__kmp_hwloc_topology != NULL) { 4630 hwloc_topology_destroy(__kmp_hwloc_topology); 4631 __kmp_hwloc_topology = NULL; 4632 } 4633 #endif 4634 KMPAffinity::destroy_api(); 4635 } 4636 4637 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4638 if (!KMP_AFFINITY_CAPABLE()) { 4639 return; 4640 } 4641 4642 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4643 if (th->th.th_affin_mask == NULL) { 4644 KMP_CPU_ALLOC(th->th.th_affin_mask); 4645 } else { 4646 KMP_CPU_ZERO(th->th.th_affin_mask); 4647 } 4648 4649 // Copy the thread mask to the kmp_info_t strucuture. If 4650 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that 4651 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4652 // then the full mask is the same as the mask of the initialization thread. 
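// Threads that do get a specific place are assigned one round-robin from the
// place table:
//   i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks
// For example, with 4 places and a zero offset, gtids 1..4 land on places
// 1, 2, 3 and 0 respectively.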
4653 kmp_affin_mask_t *mask; 4654 int i; 4655 4656 #if OMP_40_ENABLED 4657 if (KMP_AFFINITY_NON_PROC_BIND) 4658 #endif 4659 { 4660 if ((__kmp_affinity_type == affinity_none) || 4661 (__kmp_affinity_type == affinity_balanced)) { 4662 #if KMP_GROUP_AFFINITY 4663 if (__kmp_num_proc_groups > 1) { 4664 return; 4665 } 4666 #endif 4667 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4668 i = 0; 4669 mask = __kmp_affin_fullMask; 4670 } else { 4671 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4672 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4673 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4674 } 4675 } 4676 #if OMP_40_ENABLED 4677 else { 4678 if ((!isa_root) || 4679 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4680 #if KMP_GROUP_AFFINITY 4681 if (__kmp_num_proc_groups > 1) { 4682 return; 4683 } 4684 #endif 4685 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4686 i = KMP_PLACE_ALL; 4687 mask = __kmp_affin_fullMask; 4688 } else { 4689 // int i = some hash function or just a counter that doesn't 4690 // always start at 0. Use gtid for now. 4691 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4692 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4693 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4694 } 4695 } 4696 #endif 4697 4698 #if OMP_40_ENABLED 4699 th->th.th_current_place = i; 4700 if (isa_root) { 4701 th->th.th_new_place = i; 4702 th->th.th_first_place = 0; 4703 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4704 } 4705 4706 if (i == KMP_PLACE_ALL) { 4707 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4708 gtid)); 4709 } else { 4710 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4711 gtid, i)); 4712 } 4713 #else 4714 if (i == -1) { 4715 KA_TRACE( 4716 100, 4717 ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4718 gtid)); 4719 } else { 4720 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4721 gtid, i)); 4722 } 4723 #endif /* OMP_40_ENABLED */ 4724 4725 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4726 4727 if (__kmp_affinity_verbose 4728 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4729 && (__kmp_affinity_type == affinity_none || 4730 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4731 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4732 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4733 th->th.th_affin_mask); 4734 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4735 __kmp_gettid(), gtid, buf); 4736 } 4737 4738 #if KMP_OS_WINDOWS 4739 // On Windows* OS, the process affinity mask might have changed. If the user 4740 // didn't request affinity and this call fails, just continue silently. 4741 // See CQ171393. 4742 if (__kmp_affinity_type == affinity_none) { 4743 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4744 } else 4745 #endif 4746 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4747 } 4748 4749 #if OMP_40_ENABLED 4750 4751 void __kmp_affinity_set_place(int gtid) { 4752 if (!KMP_AFFINITY_CAPABLE()) { 4753 return; 4754 } 4755 4756 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4757 4758 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4759 "place = %d)\n", 4760 gtid, th->th.th_new_place, th->th.th_current_place)); 4761 4762 // Check that the new place is within this thread's partition. 
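// A thread's place partition may wrap around the end of the place list
// (i.e. th_first_place > th_last_place), which is why two forms of the range
// check are used below.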
4763   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4764   KMP_ASSERT(th->th.th_new_place >= 0);
4765   KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4766   if (th->th.th_first_place <= th->th.th_last_place) {
4767     KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4768                (th->th.th_new_place <= th->th.th_last_place));
4769   } else {
4770     KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4771                (th->th.th_new_place >= th->th.th_last_place));
4772   }
4773
4774   // Copy the thread mask to the kmp_info_t structure,
4775   // and set this thread's affinity.
4776   kmp_affin_mask_t *mask =
4777       KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4778   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4779   th->th.th_current_place = th->th.th_new_place;
4780
4781   if (__kmp_affinity_verbose) {
4782     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4783     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4784                               th->th.th_affin_mask);
4785     KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4786                __kmp_gettid(), gtid, buf);
4787   }
4788   __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4789 }
4790
4791 #endif /* OMP_40_ENABLED */
4792
4793 int __kmp_aux_set_affinity(void **mask) {
4794   int gtid;
4795   kmp_info_t *th;
4796   int retval;
4797
4798   if (!KMP_AFFINITY_CAPABLE()) {
4799     return -1;
4800   }
4801
4802   gtid = __kmp_entry_gtid();
4803   KA_TRACE(1000, ; {
4804     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4805     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4806                               (kmp_affin_mask_t *)(*mask));
4807     __kmp_debug_printf(
4808         "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4809         buf);
4810   });
4811
4812   if (__kmp_env_consistency_check) {
4813     if ((mask == NULL) || (*mask == NULL)) {
4814       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4815     } else {
4816       unsigned proc;
4817       int num_procs = 0;
4818
4819       KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4820         if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4821           KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4822         }
4823         if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4824           continue;
4825         }
4826         num_procs++;
4827       }
4828       if (num_procs == 0) {
4829         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4830       }
4831
4832 #if KMP_GROUP_AFFINITY
4833       if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4834         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4835       }
4836 #endif /* KMP_GROUP_AFFINITY */
4837     }
4838   }
4839
4840   th = __kmp_threads[gtid];
4841   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4842   retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4843   if (retval == 0) {
4844     KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4845   }
4846
4847 #if OMP_40_ENABLED
4848   th->th.th_current_place = KMP_PLACE_UNDEFINED;
4849   th->th.th_new_place = KMP_PLACE_UNDEFINED;
4850   th->th.th_first_place = 0;
4851   th->th.th_last_place = __kmp_affinity_num_masks - 1;
4852
4853   // Turn off 4.0 affinity for the current thread at this parallel level.
4854 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4855 #endif 4856 4857 return retval; 4858 } 4859 4860 int __kmp_aux_get_affinity(void **mask) { 4861 int gtid; 4862 int retval; 4863 kmp_info_t *th; 4864 4865 if (!KMP_AFFINITY_CAPABLE()) { 4866 return -1; 4867 } 4868 4869 gtid = __kmp_entry_gtid(); 4870 th = __kmp_threads[gtid]; 4871 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4872 4873 KA_TRACE(1000, ; { 4874 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4875 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4876 th->th.th_affin_mask); 4877 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4878 gtid, buf); 4879 }); 4880 4881 if (__kmp_env_consistency_check) { 4882 if ((mask == NULL) || (*mask == NULL)) { 4883 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4884 } 4885 } 4886 4887 #if !KMP_OS_WINDOWS 4888 4889 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4890 KA_TRACE(1000, ; { 4891 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4892 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4893 (kmp_affin_mask_t *)(*mask)); 4894 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4895 gtid, buf); 4896 }); 4897 return retval; 4898 4899 #else 4900 4901 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4902 return 0; 4903 4904 #endif /* KMP_OS_WINDOWS */ 4905 } 4906 4907 int __kmp_aux_get_affinity_max_proc() { 4908 if (!KMP_AFFINITY_CAPABLE()) { 4909 return 0; 4910 } 4911 #if KMP_GROUP_AFFINITY 4912 if (__kmp_num_proc_groups > 1) { 4913 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4914 } 4915 #endif 4916 return __kmp_xproc; 4917 } 4918 4919 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4920 if (!KMP_AFFINITY_CAPABLE()) { 4921 return -1; 4922 } 4923 4924 KA_TRACE(1000, ; { 4925 int gtid = __kmp_entry_gtid(); 4926 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4927 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4928 (kmp_affin_mask_t *)(*mask)); 4929 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4930 "affinity mask for thread %d = %s\n", 4931 proc, gtid, buf); 4932 }); 4933 4934 if (__kmp_env_consistency_check) { 4935 if ((mask == NULL) || (*mask == NULL)) { 4936 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4937 } 4938 } 4939 4940 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4941 return -1; 4942 } 4943 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4944 return -2; 4945 } 4946 4947 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4948 return 0; 4949 } 4950 4951 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4952 if (!KMP_AFFINITY_CAPABLE()) { 4953 return -1; 4954 } 4955 4956 KA_TRACE(1000, ; { 4957 int gtid = __kmp_entry_gtid(); 4958 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4959 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4960 (kmp_affin_mask_t *)(*mask)); 4961 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4962 "affinity mask for thread %d = %s\n", 4963 proc, gtid, buf); 4964 }); 4965 4966 if (__kmp_env_consistency_check) { 4967 if ((mask == NULL) || (*mask == NULL)) { 4968 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4969 } 4970 } 4971 4972 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4973 return -1; 4974 } 4975 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4976 return -2; 4977 } 4978 4979 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4980 return 0; 4981 } 4982 4983 int 
__kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4984 if (!KMP_AFFINITY_CAPABLE()) { 4985 return -1; 4986 } 4987 4988 KA_TRACE(1000, ; { 4989 int gtid = __kmp_entry_gtid(); 4990 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4991 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4992 (kmp_affin_mask_t *)(*mask)); 4993 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4994 "affinity mask for thread %d = %s\n", 4995 proc, gtid, buf); 4996 }); 4997 4998 if (__kmp_env_consistency_check) { 4999 if ((mask == NULL) || (*mask == NULL)) { 5000 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 5001 } 5002 } 5003 5004 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5005 return -1; 5006 } 5007 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5008 return 0; 5009 } 5010 5011 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 5012 } 5013 5014 // Dynamic affinity settings - Affinity balanced 5015 void __kmp_balanced_affinity(int tid, int nthreads) { 5016 bool fine_gran = true; 5017 5018 switch (__kmp_affinity_gran) { 5019 case affinity_gran_fine: 5020 case affinity_gran_thread: 5021 break; 5022 case affinity_gran_core: 5023 if (__kmp_nThreadsPerCore > 1) { 5024 fine_gran = false; 5025 } 5026 break; 5027 case affinity_gran_package: 5028 if (nCoresPerPkg > 1) { 5029 fine_gran = false; 5030 } 5031 break; 5032 default: 5033 fine_gran = false; 5034 } 5035 5036 if (__kmp_affinity_uniform_topology()) { 5037 int coreID; 5038 int threadID; 5039 // Number of hyper threads per core in HT machine 5040 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 5041 // Number of cores 5042 int ncores = __kmp_ncores; 5043 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 5044 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 5045 ncores = nPackages; 5046 } 5047 // How many threads will be bound to each core 5048 int chunk = nthreads / ncores; 5049 // How many cores will have an additional thread bound to it - "big cores" 5050 int big_cores = nthreads % ncores; 5051 // Number of threads on the big cores 5052 int big_nth = (chunk + 1) * big_cores; 5053 if (tid < big_nth) { 5054 coreID = tid / (chunk + 1); 5055 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 5056 } else { // tid >= big_nth 5057 coreID = (tid - big_cores) / chunk; 5058 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 5059 } 5060 5061 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 5062 "Illegal set affinity operation when not capable"); 5063 5064 kmp_affin_mask_t *mask; 5065 KMP_CPU_ALLOC_ON_STACK(mask); 5066 KMP_CPU_ZERO(mask); 5067 5068 if (fine_gran) { 5069 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 5070 KMP_CPU_SET(osID, mask); 5071 } else { 5072 for (int i = 0; i < __kmp_nth_per_core; i++) { 5073 int osID; 5074 osID = address2os[coreID * __kmp_nth_per_core + i].second; 5075 KMP_CPU_SET(osID, mask); 5076 } 5077 } 5078 if (__kmp_affinity_verbose) { 5079 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5080 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5081 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5082 __kmp_gettid(), tid, buf); 5083 } 5084 __kmp_set_system_affinity(mask, TRUE); 5085 KMP_CPU_FREE_FROM_STACK(mask); 5086 } else { // Non-uniform topology 5087 5088 kmp_affin_mask_t *mask; 5089 KMP_CPU_ALLOC_ON_STACK(mask); 5090 KMP_CPU_ZERO(mask); 5091 5092 int core_level = __kmp_affinity_find_core_level( 5093 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 5094 int ncores = __kmp_affinity_compute_ncores(address2os, 
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }

    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = address2os[coreID * __kmp_nth_per_core + i].second;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);

    int core_level = __kmp_affinity_find_core_level(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For a performance gain, consider the special case
    // nthreads == __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = address2os[tid].second;
        KMP_CPU_SET(osID, mask);
      } else {
        int core = __kmp_affinity_find_core(address2os, tid,
                                            __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = address2os[i].second;
          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
                                       core_level) == core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }
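
      // Illustrative example (hypothetical machine, not taken from a trace):
      // with nth_per_core = 2 and three cores offering {2, 1, 2} available
      // procs, nproc_at_core = {2, 1, 2}, ncores_with_x_procs = {0, 1, 2} and
      // ncores_with_x_to_max_procs = {3, 3, 2}; i.e. the last array counts
      // the cores that still offer at least "x" procs.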

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // Array to keep the number of threads per context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
}

#if KMP_OS_LINUX
// This entry is not needed on Windows because the GetProcessAffinityMask()
// API is available there.
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// The function returns 0 on success,
//   -1 if we cannot bind the thread,
//   >0 (errno) if an error occurred during binding.
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
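
// Illustrative, hypothetical caller-side sketch of the usage steps documented
// above kmp_set_thread_affinity_mask_initial(); the helper name and the error
// handling are examples only and not part of the runtime (Linux-only, since
// the sketch relies on the GNU pthread affinity extensions):
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);   // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0) {               // steps 2-3
//     run_non_openmp_parallel_work();                                // step 4
//     pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5
//   }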