/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need
  // to init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

// Print the affinity mask to the character array in a pretty format.
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf_len >= 40);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Find first element / check for empty set.
  size_t i;
  i = mask->begin();
  if (i == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    while (*scan != '\0')
      scan++;
    KMP_ASSERT(scan <= end);
    return buf;
  }

  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
  while (*scan != '\0')
    scan++;
  i++;
  for (; i != mask->end(); i = mask->next(i)) {
    if (!KMP_CPU_ISSET(i, mask)) {
      continue;
    }
    // Check for buffer overflow. A string of the form ",<n>" will have at
    // most 10 characters, plus we want to leave room to print ",...}" if the
    // set is too large to print, for a total of 15 characters. We already
    // left room for '\0' in setting end.
    if (end - scan < 15) {
      break;
    }
    KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
    while (*scan != '\0')
      scan++;
  }
  if (i != mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, ",...");
    while (*scan != '\0')
      scan++;
  }
  KMP_SNPRINTF(scan, end - scan + 1, "}");
  while (*scan != '\0')
    scan++;
  KMP_ASSERT(scan <= end);
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
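// (Illustrative note on the hypothetical labels above: with the child numbers
// assigned, the scatter sort visits 601 {0,0}, 603 {0,1}, 602 {1,0},
// 604 {1,1}, so consecutive entries alternate between the two nodes, which is
// what "scatter" intends.)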
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc ==
         (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
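// For example, on a machine modeled with 2 packages, 2 cores per package and
// 2 threads per core (pkgLevel = 0, coreLevel = 1, threadLevel = 2), the loop
// below emits lines roughly of the form (exact wording comes from the message
// catalog; the numbers here are illustrative only):
//
//   OS proc 5 maps to package 1 core 0 thread 1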
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);
    if (depth > 1) {
      int level = 1; // iterate over levels
      int label = 1; // iterate over labels
      if (__kmp_numa_detected)
        // node level follows package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                              addrP[proc].first.labels[label++]);
      if (__kmp_tile_depth > 0)
        // tile level follows node if any, or package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                              addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // core level follows
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                            addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // thread level is the last
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                            addrP[proc].first.labels[label++]);
      KMP_DEBUG_ASSERT(label == depth);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when
// there is only one thread context per core; we don't want the extra
// thread-context level if it offers no unique labels, so it is removed.
// Return value: the new depth of address2os.
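// A minimal sketch of the effect, assuming a hypothetical 4-level map
// (package, node, core, thread) in which every core has exactly one thread:
// the thread level then has identical labels everywhere, so it is radix 1.
// Removal marks levels[3] = -1 and decrements each address's depth, leaving a
// (package, node, core) map with new_depth == 3. A radix-1 level that is not
// the deepest is removed by shifting the remaining labels left instead.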
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int level;
  int i;
  int radix1_detected;
  int new_depth = depth;
  for (level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in address2os array
    if (level == new_depth) {
      // "turn off" deepest level, just decrement the depth that removes
      // the level from address2os array
      for (i = 0; i < nTh; ++i) {
        addrP[i].first.depth--;
      }
    } else {
      // For other levels, we move labels over and also reduce the depth
      int j;
      for (j = level; j < new_depth; ++j) {
        for (i = 0; i < nTh; ++i) {
          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
          addrP[i].first.depth--;
        }
        levels[j + 1] -= 1;
      }
    }
  }
  return new_depth;
}

// Returns the number of objects of type 'type' below 'obj' within the
// topology tree structure. E.g., if obj is a HWLOC_OBJ_PACKAGE object, and
// type is HWLOC_OBJ_PU, then this will return the number of PU's under the
// PACKAGE object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type,
                                      first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o, unsigned depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if none were found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t,
                                              hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if none were found (as PU arity is 0)
}

static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
  hwloc_obj_t core = NULL;
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
    hwloc_obj_t pu = NULL;
    KMP_DEBUG_ASSERT(core != NULL);
    int num_active_threads = 0;
    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
    // int NT = core->arity; pu = core->first_child; // faster?
    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
        continue; // skip inactive (inaccessible) unit
      Address addr(depth + 2);
      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                    obj->os_index, obj->logical_index, core->os_index,
                    core->logical_index, pu->os_index, pu->logical_index));
      for (int i = 0; i < depth; ++i)
        addr.labels[i] = labels[i]; // package, etc.
      addr.labels[depth] = core_id; // core
      addr.labels[depth + 1] = pu_id; // pu
      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
      __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
      nActiveThreads++;
      ++num_active_threads; // count active threads per core
    }
    if (num_active_threads) { // were there any active threads on the core?
      ++__kmp_ncores; // count total active cores
      ++num_active_cores; // count active cores per socket
      if (num_active_threads > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
    }
  }
  return 0;
}

// Check whether a NUMA node is detected below the package, and whether a tile
// object (a level-2 unified cache containing multiple cores) is present;
// if so, record its depth in __kmp_tile_depth.
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth;

  // Get some PU
  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  if (hN != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  // check tile, get object by depth because of multiple caches possible
  depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);
  __kmp_hwloc_check_numa();

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE);
    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth = 3;
  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
  if (__kmp_numa_detected)
    ++depth;
  if (__kmp_tile_depth)
    ++depth;

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.

  hwloc_obj_t socket, node, tile;
  int nActiveThreads = 0;
  int socket_id = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
       socket != NULL;
       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
      socket_id++) {
    labels[0] = socket_id;
    if (__kmp_numa_detected) {
      int NN;
      int n_active_nodes = 0;
      node = NULL;
      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
                                              &node);
      for (int node_id = 0; node_id < NN;
           ++node_id, node = node->next_cousin) {
        labels[1] = node_id;
        if (__kmp_tile_depth) {
          // NUMA + tiles
          int NT;
          int n_active_tiles = 0;
          tile = NULL;
          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
                                                   &tile);
          for (int tl_id = 0; tl_id < NT;
               ++tl_id, tile = tile->next_cousin) {
            labels[2] = tl_id;
            int n_active_cores = 0;
            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                            n_active_cores, tile, 3, labels);
            if (n_active_cores) { // were there any active cores on the tile?
              ++n_active_tiles; // count active tiles per node
              if (n_active_cores > nCorePerTile)
                nCorePerTile = n_active_cores; // calc maximum
            }
          }
          if (n_active_tiles) { // were there any active tiles on the node?
            ++n_active_nodes; // count active nodes per package
            if (n_active_tiles > nTilePerNode)
              nTilePerNode = n_active_tiles; // calc maximum
          }
        } else {
          // NUMA, no tiles
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, node, 2, labels);
          if (n_active_cores) { // were there any active cores on the node?
            ++n_active_nodes; // count active nodes per package
            if (n_active_cores > nCorePerNode)
              nCorePerNode = n_active_cores; // calc maximum
          }
        }
      }
      if (n_active_nodes) { // were there any active nodes on the socket?
        ++nPackages; // count total active packages
        if (n_active_nodes > nNodePerPkg)
          nNodePerPkg = n_active_nodes; // calc maximum
      }
    } else {
      if (__kmp_tile_depth) {
        // no NUMA, tiles
        int NT;
        int n_active_tiles = 0;
        tile = NULL;
        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
                                                 &tile);
        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
          labels[1] = tl_id;
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, tile, 2, labels);
          if (n_active_cores) { // were there any active cores on the tile?
            ++n_active_tiles; // count active tiles per package
            if (n_active_cores > nCorePerTile)
              nCorePerTile = n_active_cores; // calc maximum
          }
        }
        if (n_active_tiles) { // were there any active tiles on the socket?
          ++nPackages; // count total active packages
          if (n_active_tiles > nTilePerPkg)
            nTilePerPkg = n_active_tiles; // calc maximum
        }
      } else {
        // no NUMA, no tiles
        int n_active_cores = 0;
        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                        n_active_cores, socket, 1, labels);
        if (n_active_cores) { // were there any active cores on the socket?
          ++nPackages; // count total active packages
          if (n_active_cores > nCoresPerPkg)
            nCoresPerPkg = n_active_cores; // calc maximum
        }
      }
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  int nPUs = nPackages * __kmp_nThreadsPerCore;
  if (__kmp_numa_detected) {
    if (__kmp_tile_depth) { // NUMA + tiles
      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
    } else { // NUMA, no tiles
      nPUs *= (nNodePerPkg * nCorePerNode);
    }
  } else {
    if (__kmp_tile_depth) { // no NUMA, tiles
      nPUs *= (nTilePerPkg * nCorePerTile);
    } else { // no NUMA, no tiles
      nPUs *= nCoresPerPkg;
    }
  }
  unsigned uniform = (nPUs == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    if (__kmp_numa_detected) {
      if (__kmp_tile_depth) { // NUMA + tiles
        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
                   __kmp_ncores);
      } else { // NUMA, no tiles
        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
        nPUs *= (nNodePerPkg * nCorePerNode);
      }
    } else {
      if (__kmp_tile_depth) { // no NUMA, tiles
        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
      } else { // no NUMA, no tiles
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        __kmp_str_buf_print(&buf, "%d", nPackages);
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
                   __kmp_nThreadsPerCore, __kmp_ncores);
        __kmp_str_buf_free(&buf);
      }
    }
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth_full = depth; // number of levels before compressing
  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 levels);
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if (__kmp_affinity_gran > affinity_gran_thread) {
      for (int i = 1; i <= depth_full; ++i) {
        if (__kmp_affinity_gran <= i) // only count deeper levels
          break;
        if (levels[depth_full - i] > 0)
          __kmp_affinity_gran_levels++;
      }
    }
    if (__kmp_affinity_gran > affinity_gran_package)
      __kmp_affinity_gran_levels++; // e.g. granularity = group
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  unsigned int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
                                                   const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->osId < bb->osId)
    return -1;
  if (aa->osId > bb->osId)
    return 1;
  return 0;
}

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  int rc;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available
  // thread and retrieving info from the cpuid instruction, so if we are not
  // capable of calling __kmp_get_system_affinity() and
  // __kmp_set_system_affinity(), then we need to do something else - use the
  // defaults that we calculated from issuing cpuid without binding to each
  // proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will
    // be 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see
    // if cpuid(n) is supported, issue cpuid(0) and check if eax has the value
    // n or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
  //   value of this field determines the width of the core# + thread# fields
  //   in the Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where
  //   Intel(R) Hyper-Threading Technology is supported by the chip but has
  //   been disabled, the value of this field will be 2 (for a single core
  //   chip). On other OS/chip combinations supporting Intel(R)
  //   Hyper-Threading Technology, the value of this field will be 1 when
  //   Intel(R) Hyper-Threading Technology is disabled and 2 when it is
  //   enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
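  // Worked example (hypothetical values, for illustration only): suppose
  // cpuid(1) reports maxThreadsPerPkg = 8 and cpuid(4) reports
  // maxCoresPerPkg = 4. Then
  //   widthCT = __kmp_cpuid_mask_width(8) = 3  // core# + thread# bits
  //   widthC  = __kmp_cpuid_mask_width(4) = 2  // core# bits
  //   widthT  = widthCT - widthC         = 1  // thread# bits
  // and an Apic Id of 0x1d (binary 11101) decomposes as
  //   pkgId    = 0x1d >> 3       = 3
  //   coreId   = (0x1d >> 1) & 3 = 2
  //   threadId = 0x1d & 1        = 1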
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see
    // if cpuid(n) is supported, issue cpuid(0) and check if eax has the value
    // n or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained
    // locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned
  // among the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or
  // the total # packages) are at this point - we want to determine that now.
  // We only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars,
      // though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
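  // For illustration (hypothetical counts): with nCoresPerPkg = 4 and
  // __kmp_nThreadsPerCore = 2 this gives pkgLevel = 0, coreLevel = 1,
  // threadLevel = 2 and depth = 3; on a single-core machine with one thread
  // per core, coreLevel and threadLevel are both -1 and depth = 1.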
  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try
  // to get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but
      // so far, the only machine we have seen which does not exit this loop
      // before iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available
  // thread and retrieving info from the cpuid instruction, so if we are not
  // capable of calling __kmp_get_system_affinity() and
  // __kmp_set_system_affinity(), then we need to do something else - use the
  // defaults that we calculated from issuing cpuid without binding to each
  // proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
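  // Worked example of the label extraction below (hypothetical values): with
  // depth = 3, suppose leaf 11 reports shift = 1 at the SMT sub-leaf and
  // shift = 5 at the core sub-leaf, and a bound thread reads apicId = 0x53
  // (binary 1010011). Then
  //   labels[2] (thread)  = (0x53 & 0x01) >> 0 = 1
  //   labels[1] (core)    = (0x53 & 0x1f) >> 1 = 9
  //   labels[0] (package) =  0x53 >> 5         = 2
  // The package label comes from the final sub-leaf, where ebx == 0 and the
  // remaining high bits of the apicId are used unmasked.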
1558 unsigned int proc; 1559 int nApics = 0; 1560 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1561 // Skip this proc if it is not included in the machine model. 1562 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1563 continue; 1564 } 1565 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1566 1567 __kmp_affinity_dispatch->bind_thread(proc); 1568 1569 // Extract labels for each level in the machine topology map from Apic ID. 1570 Address addr(depth); 1571 int prev_shift = 0; 1572 1573 for (level = 0; level < depth; level++) { 1574 __kmp_x86_cpuid(11, level, &buf); 1575 unsigned apicId = buf.edx; 1576 if (buf.ebx == 0) { 1577 if (level != depth - 1) { 1578 KMP_CPU_FREE(oldMask); 1579 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1580 return -1; 1581 } 1582 addr.labels[depth - level - 1] = apicId >> prev_shift; 1583 level++; 1584 break; 1585 } 1586 int shift = buf.eax & 0x1f; 1587 int mask = (1 << shift) - 1; 1588 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1589 prev_shift = shift; 1590 } 1591 if (level != depth) { 1592 KMP_CPU_FREE(oldMask); 1593 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1594 return -1; 1595 } 1596 1597 retval[nApics] = AddrUnsPair(addr, proc); 1598 nApics++; 1599 } 1600 1601 // We've collected all the info we need. 1602 // Restore the old affinity mask for this thread. 1603 __kmp_set_system_affinity(oldMask, TRUE); 1604 1605 // If there's only one thread context to bind to, return now. 1606 KMP_ASSERT(nApics > 0); 1607 if (nApics == 1) { 1608 __kmp_ncores = nPackages = 1; 1609 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1610 if (__kmp_affinity_verbose) { 1611 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1612 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1613 1614 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1615 if (__kmp_affinity_respect_mask) { 1616 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1617 } else { 1618 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1619 } 1620 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1621 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1622 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1623 __kmp_nThreadsPerCore, __kmp_ncores); 1624 } 1625 1626 if (__kmp_affinity_type == affinity_none) { 1627 __kmp_free(retval); 1628 KMP_CPU_FREE(oldMask); 1629 return 0; 1630 } 1631 1632 // Form an Address object which only includes the package level. 1633 Address addr(1); 1634 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1635 retval[0].first = addr; 1636 1637 if (__kmp_affinity_gran_levels < 0) { 1638 __kmp_affinity_gran_levels = 0; 1639 } 1640 1641 if (__kmp_affinity_verbose) { 1642 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1643 } 1644 1645 *address2os = retval; 1646 KMP_CPU_FREE(oldMask); 1647 return 1; 1648 } 1649 1650 // Sort the table by physical Id. 1651 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1652 1653 // Find the radix at each of the levels. 
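  // A sketch with made-up numbers: for a 2-package machine with 4 cores per
  // package and 2 hardware threads per core, walking the sorted table and
  // bumping the counters whenever a label changes would end with
  //
  //   totals = { 2, 8, 16 }   // packages, cores, threads seen overall
  //   maxCt  = { 2, 4,  2 }   // largest child count seen at each level
  //
  // and the product maxCt[0] * maxCt[1] * maxCt[2] == totals[2] marks the
  // topology as uniform.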
1654   unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1655   unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1656   unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1657   unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1658   for (level = 0; level < depth; level++) {
1659     totals[level] = 1;
1660     maxCt[level] = 1;
1661     counts[level] = 1;
1662     last[level] = retval[0].first.labels[level];
1663   }
1664
1665   // From here on, the iteration variable "level" runs from the coarsest level
1666   // to the finest, i.e. we iterate forward through
1667   // (*address2os)[].first.labels[] - in the previous loops, we iterated
1668   // backwards.
1669   for (proc = 1; (int)proc < nApics; proc++) {
1670     int level;
1671     for (level = 0; level < depth; level++) {
1672       if (retval[proc].first.labels[level] != last[level]) {
1673         int j;
1674         for (j = level + 1; j < depth; j++) {
1675           totals[j]++;
1676           counts[j] = 1;
1677           // The line below (now commented out) caused incorrect topology
1678           // information to be printed when the maximum value for some level
1679           // (maxCt[level]) was encountered before a smaller value while going
1680           // through the array. For example, suppose pkg0 has 4 cores and
1681           // pkg1 has 2 cores; then maxCt[1] would end up as 2,
1682           // whereas it should be 4.
1683           // TODO!!! Check if it can be commented safely
1684           // maxCt[j] = 1;
1685           last[j] = retval[proc].first.labels[j];
1686         }
1687         totals[level]++;
1688         counts[level]++;
1689         if (counts[level] > maxCt[level]) {
1690           maxCt[level] = counts[level];
1691         }
1692         last[level] = retval[proc].first.labels[level];
1693         break;
1694       } else if (level == depth - 1) {
1695         __kmp_free(last);
1696         __kmp_free(maxCt);
1697         __kmp_free(counts);
1698         __kmp_free(totals);
1699         __kmp_free(retval);
1700         KMP_CPU_FREE(oldMask);
1701         *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1702         return -1;
1703       }
1704     }
1705   }
1706
1707   // When affinity is off, this routine will still be called to set
1708   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1709   // Make sure all these vars are set correctly, and return if affinity is not
1710   // enabled.
1711   if (threadLevel >= 0) {
1712     __kmp_nThreadsPerCore = maxCt[threadLevel];
1713   } else {
1714     __kmp_nThreadsPerCore = 1;
1715   }
1716   nPackages = totals[pkgLevel];
1717
1718   if (coreLevel >= 0) {
1719     __kmp_ncores = totals[coreLevel];
1720     nCoresPerPkg = maxCt[coreLevel];
1721   } else {
1722     __kmp_ncores = nPackages;
1723     nCoresPerPkg = 1;
1724   }
1725
1726   // Check to see if the machine topology is uniform
1727   unsigned prod = maxCt[0];
1728   for (level = 1; level < depth; level++) {
1729     prod *= maxCt[level];
1730   }
1731   bool uniform = (prod == totals[level - 1]);
1732
1733   // Print the machine topology summary.
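  // (For the hypothetical 2 x 4 x 2 box sketched above, and assuming the
  // package level ended up at labels[0], the "extra" topology string built
  // below would be just "2", reported alongside nCoresPerPkg == 4,
  // __kmp_nThreadsPerCore == 2 and __kmp_ncores == 8.)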
1734 if (__kmp_affinity_verbose) { 1735 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1736 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1737 1738 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1739 if (__kmp_affinity_respect_mask) { 1740 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1741 } else { 1742 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1743 } 1744 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1745 if (uniform) { 1746 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1747 } else { 1748 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1749 } 1750 1751 kmp_str_buf_t buf; 1752 __kmp_str_buf_init(&buf); 1753 1754 __kmp_str_buf_print(&buf, "%d", totals[0]); 1755 for (level = 1; level <= pkgLevel; level++) { 1756 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1757 } 1758 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1759 __kmp_nThreadsPerCore, __kmp_ncores); 1760 1761 __kmp_str_buf_free(&buf); 1762 } 1763 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1764 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1765 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1766 for (proc = 0; (int)proc < nApics; ++proc) { 1767 __kmp_pu_os_idx[proc] = retval[proc].second; 1768 } 1769 if (__kmp_affinity_type == affinity_none) { 1770 __kmp_free(last); 1771 __kmp_free(maxCt); 1772 __kmp_free(counts); 1773 __kmp_free(totals); 1774 __kmp_free(retval); 1775 KMP_CPU_FREE(oldMask); 1776 return 0; 1777 } 1778 1779 // Find any levels with radiix 1, and remove them from the map 1780 // (except for the package level). 1781 int new_depth = 0; 1782 for (level = 0; level < depth; level++) { 1783 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1784 continue; 1785 } 1786 new_depth++; 1787 } 1788 1789 // If we are removing any levels, allocate a new vector to return, 1790 // and copy the relevant information to it. 1791 if (new_depth != depth) { 1792 AddrUnsPair *new_retval = 1793 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1794 for (proc = 0; (int)proc < nApics; proc++) { 1795 Address addr(new_depth); 1796 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1797 } 1798 int new_level = 0; 1799 int newPkgLevel = -1; 1800 int newCoreLevel = -1; 1801 int newThreadLevel = -1; 1802 int i; 1803 for (level = 0; level < depth; level++) { 1804 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1805 // Remove this level. Never remove the package level 1806 continue; 1807 } 1808 if (level == pkgLevel) { 1809 newPkgLevel = new_level; 1810 } 1811 if (level == coreLevel) { 1812 newCoreLevel = new_level; 1813 } 1814 if (level == threadLevel) { 1815 newThreadLevel = new_level; 1816 } 1817 for (proc = 0; (int)proc < nApics; proc++) { 1818 new_retval[proc].first.labels[new_level] = 1819 retval[proc].first.labels[level]; 1820 } 1821 new_level++; 1822 } 1823 1824 __kmp_free(retval); 1825 retval = new_retval; 1826 depth = new_depth; 1827 pkgLevel = newPkgLevel; 1828 coreLevel = newCoreLevel; 1829 threadLevel = newThreadLevel; 1830 } 1831 1832 if (__kmp_affinity_gran_levels < 0) { 1833 // Set the granularity level based on what levels are modeled 1834 // in the machine topology map. 
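  // __kmp_affinity_gran_levels counts how many of the innermost levels are
  // folded into a single affinity mask. For example, assuming all three
  // levels are modeled here: granularity=fine or granularity=thread leaves
  // it at 0, granularity=core gives 1 (the thread level is folded in), and
  // granularity=package gives 2 (thread and core levels are folded in).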
1835 __kmp_affinity_gran_levels = 0; 1836 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1837 __kmp_affinity_gran_levels++; 1838 } 1839 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1840 __kmp_affinity_gran_levels++; 1841 } 1842 if (__kmp_affinity_gran > affinity_gran_package) { 1843 __kmp_affinity_gran_levels++; 1844 } 1845 } 1846 1847 if (__kmp_affinity_verbose) { 1848 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, 1849 threadLevel); 1850 } 1851 1852 __kmp_free(last); 1853 __kmp_free(maxCt); 1854 __kmp_free(counts); 1855 __kmp_free(totals); 1856 KMP_CPU_FREE(oldMask); 1857 *address2os = retval; 1858 return depth; 1859 } 1860 1861 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1862 1863 #define osIdIndex 0 1864 #define threadIdIndex 1 1865 #define coreIdIndex 2 1866 #define pkgIdIndex 3 1867 #define nodeIdIndex 4 1868 1869 typedef unsigned *ProcCpuInfo; 1870 static unsigned maxIndex = pkgIdIndex; 1871 1872 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) { 1873 const unsigned *aa = (const unsigned *)a; 1874 const unsigned *bb = (const unsigned *)b; 1875 if (aa[osIdIndex] < bb[osIdIndex]) 1876 return -1; 1877 if (aa[osIdIndex] > bb[osIdIndex]) 1878 return 1; 1879 return 0; 1880 } 1881 1882 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 1883 const void *b) { 1884 unsigned i; 1885 const unsigned *aa = *(unsigned *const *)a; 1886 const unsigned *bb = *(unsigned *const *)b; 1887 for (i = maxIndex;; i--) { 1888 if (aa[i] < bb[i]) 1889 return -1; 1890 if (aa[i] > bb[i]) 1891 return 1; 1892 if (i == osIdIndex) 1893 break; 1894 } 1895 return 0; 1896 } 1897 1898 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1899 // affinity map. 1900 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 1901 int *line, 1902 kmp_i18n_id_t *const msg_id, 1903 FILE *f) { 1904 *address2os = NULL; 1905 *msg_id = kmp_i18n_null; 1906 1907 // Scan of the file, and count the number of "processor" (osId) fields, 1908 // and find the highest value of <n> for a node_<n> field. 1909 char buf[256]; 1910 unsigned num_records = 0; 1911 while (!feof(f)) { 1912 buf[sizeof(buf) - 1] = 1; 1913 if (!fgets(buf, sizeof(buf), f)) { 1914 // Read errors presumably because of EOF 1915 break; 1916 } 1917 1918 char s1[] = "processor"; 1919 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1920 num_records++; 1921 continue; 1922 } 1923 1924 // FIXME - this will match "node_<n> <garbage>" 1925 unsigned level; 1926 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 1927 if (nodeIdIndex + level >= maxIndex) { 1928 maxIndex = nodeIdIndex + level; 1929 } 1930 continue; 1931 } 1932 } 1933 1934 // Check for empty file / no valid processor records, or too many. The number 1935 // of records can't exceed the number of valid bits in the affinity mask. 1936 if (num_records == 0) { 1937 *line = 0; 1938 *msg_id = kmp_i18n_str_NoProcRecords; 1939 return -1; 1940 } 1941 if (num_records > (unsigned)__kmp_xproc) { 1942 *line = 0; 1943 *msg_id = kmp_i18n_str_TooManyProcRecords; 1944 return -1; 1945 } 1946 1947 // Set the file pointer back to the begginning, so that we can scan the file 1948 // again, this time performing a full parse of the data. Allocate a vector of 1949 // ProcCpuInfo object, where we will place the data. Adding an extra element 1950 // at the end allows us to remove a lot of extra checks for termination 1951 // conditions. 
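  // A typical record in /proc/cpuinfo contains, among many other fields,
  // lines of roughly this shape (values are illustrative):
  //
  //   processor       : 12
  //   physical id     : 1
  //   core id         : 6
  //
  // Records are separated by blank lines; the parser below keys off the
  // "processor", "physical id", "core id", "thread id" and "node_<n> id"
  // prefixes and skips everything else.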
1952   if (fseek(f, 0, SEEK_SET) != 0) {
1953     *line = 0;
1954     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1955     return -1;
1956   }
1957
1958   // Allocate the array of records to store the proc info in. The dummy
1959   // element at the end makes the logic in filling them out easier to code.
1960   unsigned **threadInfo =
1961       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
1962   unsigned i;
1963   for (i = 0; i <= num_records; i++) {
1964     threadInfo[i] =
1965         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
1966   }
1967
1968 #define CLEANUP_THREAD_INFO                                                    \
1969   for (i = 0; i <= num_records; i++) {                                         \
1970     __kmp_free(threadInfo[i]);                                                 \
1971   }                                                                            \
1972   __kmp_free(threadInfo);
1973
1974   // A value of UINT_MAX means that we didn't find the field
1975   unsigned __index;
1976
1977 #define INIT_PROC_INFO(p)                                                      \
1978   for (__index = 0; __index <= maxIndex; __index++) {                          \
1979     (p)[__index] = UINT_MAX;                                                   \
1980   }
1981
1982   for (i = 0; i <= num_records; i++) {
1983     INIT_PROC_INFO(threadInfo[i]);
1984   }
1985
1986   unsigned num_avail = 0;
1987   *line = 0;
1988   while (!feof(f)) {
1989     // Create an inner scoping level, so that all the goto targets at the end of
1990     // the loop appear in an outer scoping level. This avoids warnings about
1991     // jumping past an initialization to a target in the same block.
1992     {
1993       buf[sizeof(buf) - 1] = 1;
1994       bool long_line = false;
1995       if (!fgets(buf, sizeof(buf), f)) {
1996         // Read errors presumably because of EOF
1997         // If there is valid data in threadInfo[num_avail], then fake
1998         // a blank line to ensure that the last address gets parsed.
1999         bool valid = false;
2000         for (i = 0; i <= maxIndex; i++) {
2001           if (threadInfo[num_avail][i] != UINT_MAX) {
2002             valid = true;
2003           }
2004         }
2005         if (!valid) {
2006           break;
2007         }
2008         buf[0] = 0;
2009       } else if (!buf[sizeof(buf) - 1]) {
2010         // The line is longer than the buffer. Set a flag and don't
2011         // emit an error if we were going to ignore the line, anyway.
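        // (This works because the last byte of buf was pre-set to 1 before the
        // fgets() call above; fgets() overwrites that byte with '\0' only when
        // it fills the whole buffer, which is how an over-long line is
        // detected:
        //
        //   buf[sizeof(buf) - 1] = 1;      // sentinel
        //   fgets(buf, sizeof(buf), f);
        //   long_line = !buf[sizeof(buf) - 1];
        //
        // The snippet above just restates the check performed here.)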
2012 long_line = true; 2013 2014 #define CHECK_LINE \ 2015 if (long_line) { \ 2016 CLEANUP_THREAD_INFO; \ 2017 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2018 return -1; \ 2019 } 2020 } 2021 (*line)++; 2022 2023 char s1[] = "processor"; 2024 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2025 CHECK_LINE; 2026 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2027 unsigned val; 2028 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2029 goto no_val; 2030 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2031 goto dup_field; 2032 threadInfo[num_avail][osIdIndex] = val; 2033 #if KMP_OS_LINUX && USE_SYSFS_INFO 2034 char path[256]; 2035 KMP_SNPRINTF( 2036 path, sizeof(path), 2037 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2038 threadInfo[num_avail][osIdIndex]); 2039 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2040 2041 KMP_SNPRINTF(path, sizeof(path), 2042 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2043 threadInfo[num_avail][osIdIndex]); 2044 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2045 continue; 2046 #else 2047 } 2048 char s2[] = "physical id"; 2049 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2050 CHECK_LINE; 2051 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2052 unsigned val; 2053 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2054 goto no_val; 2055 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2056 goto dup_field; 2057 threadInfo[num_avail][pkgIdIndex] = val; 2058 continue; 2059 } 2060 char s3[] = "core id"; 2061 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2062 CHECK_LINE; 2063 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2064 unsigned val; 2065 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2066 goto no_val; 2067 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2068 goto dup_field; 2069 threadInfo[num_avail][coreIdIndex] = val; 2070 continue; 2071 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2072 } 2073 char s4[] = "thread id"; 2074 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2075 CHECK_LINE; 2076 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2077 unsigned val; 2078 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2079 goto no_val; 2080 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2081 goto dup_field; 2082 threadInfo[num_avail][threadIdIndex] = val; 2083 continue; 2084 } 2085 unsigned level; 2086 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2087 CHECK_LINE; 2088 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2089 unsigned val; 2090 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2091 goto no_val; 2092 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2093 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2094 goto dup_field; 2095 threadInfo[num_avail][nodeIdIndex + level] = val; 2096 continue; 2097 } 2098 2099 // We didn't recognize the leading token on the line. There are lots of 2100 // leading tokens that we don't recognize - if the line isn't empty, go on 2101 // to the next line. 2102 if ((*buf != 0) && (*buf != '\n')) { 2103 // If the line is longer than the buffer, read characters 2104 // until we find a newline. 2105 if (long_line) { 2106 int ch; 2107 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2108 ; 2109 } 2110 continue; 2111 } 2112 2113 // A newline has signalled the end of the processor record. 2114 // Check that there aren't too many procs specified. 2115 if ((int)num_avail == __kmp_xproc) { 2116 CLEANUP_THREAD_INFO; 2117 *msg_id = kmp_i18n_str_TooManyEntries; 2118 return -1; 2119 } 2120 2121 // Check for missing fields. 
The osId field must be there, and we 2122 // currently require that the physical id field is specified, also. 2123 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2124 CLEANUP_THREAD_INFO; 2125 *msg_id = kmp_i18n_str_MissingProcField; 2126 return -1; 2127 } 2128 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2129 CLEANUP_THREAD_INFO; 2130 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2131 return -1; 2132 } 2133 2134 // Skip this proc if it is not included in the machine model. 2135 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2136 __kmp_affin_fullMask)) { 2137 INIT_PROC_INFO(threadInfo[num_avail]); 2138 continue; 2139 } 2140 2141 // We have a successful parse of this proc's info. 2142 // Increment the counter, and prepare for the next proc. 2143 num_avail++; 2144 KMP_ASSERT(num_avail <= num_records); 2145 INIT_PROC_INFO(threadInfo[num_avail]); 2146 } 2147 continue; 2148 2149 no_val: 2150 CLEANUP_THREAD_INFO; 2151 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2152 return -1; 2153 2154 dup_field: 2155 CLEANUP_THREAD_INFO; 2156 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2157 return -1; 2158 } 2159 *line = 0; 2160 2161 #if KMP_MIC && REDUCE_TEAM_SIZE 2162 unsigned teamSize = 0; 2163 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2164 2165 // check for num_records == __kmp_xproc ??? 2166 2167 // If there's only one thread context to bind to, form an Address object with 2168 // depth 1 and return immediately (or, if affinity is off, set address2os to 2169 // NULL and return). 2170 // 2171 // If it is configured to omit the package level when there is only a single 2172 // package, the logic at the end of this routine won't work if there is only a 2173 // single thread - it would try to form an Address object with depth 0. 2174 KMP_ASSERT(num_avail > 0); 2175 KMP_ASSERT(num_avail <= num_records); 2176 if (num_avail == 1) { 2177 __kmp_ncores = 1; 2178 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2179 if (__kmp_affinity_verbose) { 2180 if (!KMP_AFFINITY_CAPABLE()) { 2181 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2182 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2183 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2184 } else { 2185 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2186 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2187 __kmp_affin_fullMask); 2188 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2189 if (__kmp_affinity_respect_mask) { 2190 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2191 } else { 2192 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2193 } 2194 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2195 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2196 } 2197 int index; 2198 kmp_str_buf_t buf; 2199 __kmp_str_buf_init(&buf); 2200 __kmp_str_buf_print(&buf, "1"); 2201 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2202 __kmp_str_buf_print(&buf, " x 1"); 2203 } 2204 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2205 __kmp_str_buf_free(&buf); 2206 } 2207 2208 if (__kmp_affinity_type == affinity_none) { 2209 CLEANUP_THREAD_INFO; 2210 return 0; 2211 } 2212 2213 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2214 Address addr(1); 2215 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2216 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2217 2218 if (__kmp_affinity_gran_levels < 0) { 2219 __kmp_affinity_gran_levels = 0; 2220 } 2221 2222 if (__kmp_affinity_verbose) { 2223 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2224 } 2225 2226 CLEANUP_THREAD_INFO; 2227 
return 1; 2228 } 2229 2230 // Sort the threadInfo table by physical Id. 2231 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2232 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2233 2234 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2235 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2236 // the chips on a system. Although coreId's are usually assigned 2237 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2238 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2239 // 2240 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2241 // total # packages) are at this point - we want to determine that now. We 2242 // only have an upper bound on the first two figures. 2243 unsigned *counts = 2244 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2245 unsigned *maxCt = 2246 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2247 unsigned *totals = 2248 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2249 unsigned *lastId = 2250 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2251 2252 bool assign_thread_ids = false; 2253 unsigned threadIdCt; 2254 unsigned index; 2255 2256 restart_radix_check: 2257 threadIdCt = 0; 2258 2259 // Initialize the counter arrays with data from threadInfo[0]. 2260 if (assign_thread_ids) { 2261 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2262 threadInfo[0][threadIdIndex] = threadIdCt++; 2263 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2264 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2265 } 2266 } 2267 for (index = 0; index <= maxIndex; index++) { 2268 counts[index] = 1; 2269 maxCt[index] = 1; 2270 totals[index] = 1; 2271 lastId[index] = threadInfo[0][index]; 2272 ; 2273 } 2274 2275 // Run through the rest of the OS procs. 2276 for (i = 1; i < num_avail; i++) { 2277 // Find the most significant index whose id differs from the id for the 2278 // previous OS proc. 2279 for (index = maxIndex; index >= threadIdIndex; index--) { 2280 if (assign_thread_ids && (index == threadIdIndex)) { 2281 // Auto-assign the thread id field if it wasn't specified. 2282 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2283 threadInfo[i][threadIdIndex] = threadIdCt++; 2284 } 2285 // Apparently the thread id field was specified for some entries and not 2286 // others. Start the thread id counter off at the next higher thread id. 2287 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2288 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2289 } 2290 } 2291 if (threadInfo[i][index] != lastId[index]) { 2292 // Run through all indices which are less significant, and reset the 2293 // counts to 1. At all levels up to and including index, we need to 2294 // increment the totals and record the last id. 2295 unsigned index2; 2296 for (index2 = threadIdIndex; index2 < index; index2++) { 2297 totals[index2]++; 2298 if (counts[index2] > maxCt[index2]) { 2299 maxCt[index2] = counts[index2]; 2300 } 2301 counts[index2] = 1; 2302 lastId[index2] = threadInfo[i][index2]; 2303 } 2304 counts[index]++; 2305 totals[index]++; 2306 lastId[index] = threadInfo[i][index]; 2307 2308 if (assign_thread_ids && (index > threadIdIndex)) { 2309 2310 #if KMP_MIC && REDUCE_TEAM_SIZE 2311 // The default team size is the total #threads in the machine 2312 // minus 1 thread for every core that has 3 or more threads. 2313 teamSize += (threadIdCt <= 2) ? 
                      (threadIdCt) : (threadIdCt - 1);
2314 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2315
2316           // Restart the thread counter, as we are on a new core.
2317           threadIdCt = 0;
2318
2319           // Auto-assign the thread id field if it wasn't specified.
2320           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2321             threadInfo[i][threadIdIndex] = threadIdCt++;
2322           }
2323
2324           // Apparently the thread id field was specified for some entries and
2325           // not others. Start the thread id counter off at the next higher
2326           // thread id.
2327           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2328             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2329           }
2330         }
2331         break;
2332       }
2333     }
2334     if (index < threadIdIndex) {
2335       // If thread ids were specified, it is an error if they are not unique.
2336       // Also, check that we haven't already restarted the loop (to be safe -
2337       // shouldn't need to).
2338       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2339         __kmp_free(lastId);
2340         __kmp_free(totals);
2341         __kmp_free(maxCt);
2342         __kmp_free(counts);
2343         CLEANUP_THREAD_INFO;
2344         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2345         return -1;
2346       }
2347
2348       // If the thread ids were not specified and we see entries that are
2349       // duplicates, start the loop over and assign the thread ids manually.
2350       assign_thread_ids = true;
2351       goto restart_radix_check;
2352     }
2353   }
2354
2355 #if KMP_MIC && REDUCE_TEAM_SIZE
2356   // The default team size is the total #threads in the machine
2357   // minus 1 thread for every core that has 3 or more threads.
2358   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2359 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2360
2361   for (index = threadIdIndex; index <= maxIndex; index++) {
2362     if (counts[index] > maxCt[index]) {
2363       maxCt[index] = counts[index];
2364     }
2365   }
2366
2367   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2368   nCoresPerPkg = maxCt[coreIdIndex];
2369   nPackages = totals[pkgIdIndex];
2370
2371   // Check to see if the machine topology is uniform
2372   unsigned prod = totals[maxIndex];
2373   for (index = threadIdIndex; index < maxIndex; index++) {
2374     prod *= maxCt[index];
2375   }
2376   bool uniform = (prod == totals[threadIdIndex]);
2377
2378   // When affinity is off, this routine will still be called to set
2379   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2380   // Make sure all these vars are set correctly, and return now if affinity is
2381   // not enabled.
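  // As an illustration with made-up numbers: on a 2-package box with 4 cores
  // per package and 2 threads per core, the radix pass above would leave
  //
  //   maxCt[threadIdIndex] == 2    totals[coreIdIndex] == 8
  //   maxCt[coreIdIndex]   == 4    totals[pkgIdIndex]  == 2
  //
  // so __kmp_nThreadsPerCore == 2, nCoresPerPkg == 4 and nPackages == 2 were
  // set above, and __kmp_ncores == 8 is set just below.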
2382 __kmp_ncores = totals[coreIdIndex]; 2383 2384 if (__kmp_affinity_verbose) { 2385 if (!KMP_AFFINITY_CAPABLE()) { 2386 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2387 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2388 if (uniform) { 2389 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2390 } else { 2391 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2392 } 2393 } else { 2394 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2395 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2396 __kmp_affin_fullMask); 2397 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2398 if (__kmp_affinity_respect_mask) { 2399 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2400 } else { 2401 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2402 } 2403 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2404 if (uniform) { 2405 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2406 } else { 2407 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2408 } 2409 } 2410 kmp_str_buf_t buf; 2411 __kmp_str_buf_init(&buf); 2412 2413 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2414 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2415 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2416 } 2417 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2418 maxCt[threadIdIndex], __kmp_ncores); 2419 2420 __kmp_str_buf_free(&buf); 2421 } 2422 2423 #if KMP_MIC && REDUCE_TEAM_SIZE 2424 // Set the default team size. 2425 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2426 __kmp_dflt_team_nth = teamSize; 2427 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2428 "__kmp_dflt_team_nth = %d\n", 2429 __kmp_dflt_team_nth)); 2430 } 2431 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2432 2433 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2434 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2435 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2436 for (i = 0; i < num_avail; ++i) { // fill the os indices 2437 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2438 } 2439 2440 if (__kmp_affinity_type == affinity_none) { 2441 __kmp_free(lastId); 2442 __kmp_free(totals); 2443 __kmp_free(maxCt); 2444 __kmp_free(counts); 2445 CLEANUP_THREAD_INFO; 2446 return 0; 2447 } 2448 2449 // Count the number of levels which have more nodes at that level than at the 2450 // parent's level (with there being an implicit root node of the top level). 2451 // This is equivalent to saying that there is at least one node at this level 2452 // which has a sibling. These levels are in the map, and the package level is 2453 // always in the map. 2454 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2455 int level = 0; 2456 for (index = threadIdIndex; index < maxIndex; index++) { 2457 KMP_ASSERT(totals[index] >= totals[index + 1]); 2458 inMap[index] = (totals[index] > totals[index + 1]); 2459 } 2460 inMap[maxIndex] = (totals[maxIndex] > 1); 2461 inMap[pkgIdIndex] = true; 2462 2463 int depth = 0; 2464 for (index = threadIdIndex; index <= maxIndex; index++) { 2465 if (inMap[index]) { 2466 depth++; 2467 } 2468 } 2469 KMP_ASSERT(depth > 0); 2470 2471 // Construct the data structure that is to be returned. 
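  // Each entry pairs an Address (labels ordered from the package level down
  // to the thread level, keeping only the levels marked in inMap) with the
  // OS processor id. For example (hypothetical values), one hardware thread
  // might end up as
  //
  //   labels = { 1, 3, 0 }   // package 1, core 3, thread 0
  //   second = 14            // OS proc id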
2472 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2473 int pkgLevel = -1; 2474 int coreLevel = -1; 2475 int threadLevel = -1; 2476 2477 for (i = 0; i < num_avail; ++i) { 2478 Address addr(depth); 2479 unsigned os = threadInfo[i][osIdIndex]; 2480 int src_index; 2481 int dst_index = 0; 2482 2483 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2484 if (!inMap[src_index]) { 2485 continue; 2486 } 2487 addr.labels[dst_index] = threadInfo[i][src_index]; 2488 if (src_index == pkgIdIndex) { 2489 pkgLevel = dst_index; 2490 } else if (src_index == coreIdIndex) { 2491 coreLevel = dst_index; 2492 } else if (src_index == threadIdIndex) { 2493 threadLevel = dst_index; 2494 } 2495 dst_index++; 2496 } 2497 (*address2os)[i] = AddrUnsPair(addr, os); 2498 } 2499 2500 if (__kmp_affinity_gran_levels < 0) { 2501 // Set the granularity level based on what levels are modeled 2502 // in the machine topology map. 2503 unsigned src_index; 2504 __kmp_affinity_gran_levels = 0; 2505 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2506 if (!inMap[src_index]) { 2507 continue; 2508 } 2509 switch (src_index) { 2510 case threadIdIndex: 2511 if (__kmp_affinity_gran > affinity_gran_thread) { 2512 __kmp_affinity_gran_levels++; 2513 } 2514 2515 break; 2516 case coreIdIndex: 2517 if (__kmp_affinity_gran > affinity_gran_core) { 2518 __kmp_affinity_gran_levels++; 2519 } 2520 break; 2521 2522 case pkgIdIndex: 2523 if (__kmp_affinity_gran > affinity_gran_package) { 2524 __kmp_affinity_gran_levels++; 2525 } 2526 break; 2527 } 2528 } 2529 } 2530 2531 if (__kmp_affinity_verbose) { 2532 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2533 coreLevel, threadLevel); 2534 } 2535 2536 __kmp_free(inMap); 2537 __kmp_free(lastId); 2538 __kmp_free(totals); 2539 __kmp_free(maxCt); 2540 __kmp_free(counts); 2541 CLEANUP_THREAD_INFO; 2542 return depth; 2543 } 2544 2545 // Create and return a table of affinity masks, indexed by OS thread ID. 2546 // This routine handles OR'ing together all the affinity masks of threads 2547 // that are sufficiently close, if granularity > fine. 2548 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2549 unsigned *numUnique, 2550 AddrUnsPair *address2os, 2551 unsigned numAddrs) { 2552 // First form a table of affinity masks in order of OS thread id. 2553 unsigned depth; 2554 unsigned maxOsId; 2555 unsigned i; 2556 2557 KMP_ASSERT(numAddrs > 0); 2558 depth = address2os[0].first.depth; 2559 2560 maxOsId = 0; 2561 for (i = numAddrs - 1;; --i) { 2562 unsigned osId = address2os[i].second; 2563 if (osId > maxOsId) { 2564 maxOsId = osId; 2565 } 2566 if (i == 0) 2567 break; 2568 } 2569 kmp_affin_mask_t *osId2Mask; 2570 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2571 2572 // Sort the address2os table according to physical order. Doing so will put 2573 // all threads on the same core/package/node in consecutive locations. 
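  // For example (hypothetical OS numbering), with granularity=core on a
  // machine with 2 hardware threads per core, sibling OS procs 3 and 11
  // would be adjacent after the sort and would both receive the same
  // two-bit mask {3,11} in the osId2Mask table built below.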
2574 qsort(address2os, numAddrs, sizeof(*address2os), 2575 __kmp_affinity_cmp_Address_labels); 2576 2577 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2578 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2579 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2580 } 2581 if (__kmp_affinity_gran_levels >= (int)depth) { 2582 if (__kmp_affinity_verbose || 2583 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2584 KMP_WARNING(AffThreadsMayMigrate); 2585 } 2586 } 2587 2588 // Run through the table, forming the masks for all threads on each core. 2589 // Threads on the same core will have identical "Address" objects, not 2590 // considering the last level, which must be the thread id. All threads on a 2591 // core will appear consecutively. 2592 unsigned unique = 0; 2593 unsigned j = 0; // index of 1st thread on core 2594 unsigned leader = 0; 2595 Address *leaderAddr = &(address2os[0].first); 2596 kmp_affin_mask_t *sum; 2597 KMP_CPU_ALLOC_ON_STACK(sum); 2598 KMP_CPU_ZERO(sum); 2599 KMP_CPU_SET(address2os[0].second, sum); 2600 for (i = 1; i < numAddrs; i++) { 2601 // If this thread is sufficiently close to the leader (within the 2602 // granularity setting), then set the bit for this os thread in the 2603 // affinity mask for this group, and go on to the next thread. 2604 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2605 KMP_CPU_SET(address2os[i].second, sum); 2606 continue; 2607 } 2608 2609 // For every thread in this group, copy the mask to the thread's entry in 2610 // the osId2Mask table. Mark the first address as a leader. 2611 for (; j < i; j++) { 2612 unsigned osId = address2os[j].second; 2613 KMP_DEBUG_ASSERT(osId <= maxOsId); 2614 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2615 KMP_CPU_COPY(mask, sum); 2616 address2os[j].first.leader = (j == leader); 2617 } 2618 unique++; 2619 2620 // Start a new mask. 2621 leader = i; 2622 leaderAddr = &(address2os[i].first); 2623 KMP_CPU_ZERO(sum); 2624 KMP_CPU_SET(address2os[i].second, sum); 2625 } 2626 2627 // For every thread in last group, copy the mask to the thread's 2628 // entry in the osId2Mask table. 2629 for (; j < i; j++) { 2630 unsigned osId = address2os[j].second; 2631 KMP_DEBUG_ASSERT(osId <= maxOsId); 2632 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2633 KMP_CPU_COPY(mask, sum); 2634 address2os[j].first.leader = (j == leader); 2635 } 2636 unique++; 2637 KMP_CPU_FREE_FROM_STACK(sum); 2638 2639 *maxIndex = maxOsId; 2640 *numUnique = unique; 2641 return osId2Mask; 2642 } 2643 2644 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2645 // as file-static than to try and pass them through the calling sequence of 2646 // the recursive-descent OMP_PLACES parser. 
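// As a worked example (hypothetical OS proc ids): an explicit proclist such
// as "0-2,{4,5},7" is parsed by __kmp_affinity_process_proclist() below into
// five masks, one per place: {0}, {1}, {2}, {4,5}, {7}. Single ids and each
// element of a range get their own mask, while a braced set is OR'ed into a
// single mask.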
2647 static kmp_affin_mask_t *newMasks; 2648 static int numNewMasks; 2649 static int nextNewMask; 2650 2651 #define ADD_MASK(_mask) \ 2652 { \ 2653 if (nextNewMask >= numNewMasks) { \ 2654 int i; \ 2655 numNewMasks *= 2; \ 2656 kmp_affin_mask_t *temp; \ 2657 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2658 for (i = 0; i < numNewMasks / 2; i++) { \ 2659 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2660 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2661 KMP_CPU_COPY(dest, src); \ 2662 } \ 2663 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2664 newMasks = temp; \ 2665 } \ 2666 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2667 nextNewMask++; \ 2668 } 2669 2670 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2671 { \ 2672 if (((_osId) > _maxOsId) || \ 2673 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2674 if (__kmp_affinity_verbose || \ 2675 (__kmp_affinity_warnings && \ 2676 (__kmp_affinity_type != affinity_none))) { \ 2677 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2678 } \ 2679 } else { \ 2680 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2681 } \ 2682 } 2683 2684 // Re-parse the proclist (for the explicit affinity type), and form the list 2685 // of affinity newMasks indexed by gtid. 2686 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2687 unsigned int *out_numMasks, 2688 const char *proclist, 2689 kmp_affin_mask_t *osId2Mask, 2690 int maxOsId) { 2691 int i; 2692 const char *scan = proclist; 2693 const char *next = proclist; 2694 2695 // We use malloc() for the temporary mask vector, so that we can use 2696 // realloc() to extend it. 2697 numNewMasks = 2; 2698 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2699 nextNewMask = 0; 2700 kmp_affin_mask_t *sumMask; 2701 KMP_CPU_ALLOC(sumMask); 2702 int setSize = 0; 2703 2704 for (;;) { 2705 int start, end, stride; 2706 2707 SKIP_WS(scan); 2708 next = scan; 2709 if (*next == '\0') { 2710 break; 2711 } 2712 2713 if (*next == '{') { 2714 int num; 2715 setSize = 0; 2716 next++; // skip '{' 2717 SKIP_WS(next); 2718 scan = next; 2719 2720 // Read the first integer in the set. 2721 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2722 SKIP_DIGITS(next); 2723 num = __kmp_str_to_int(scan, *next); 2724 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2725 2726 // Copy the mask for that osId to the sum (union) mask. 2727 if ((num > maxOsId) || 2728 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2729 if (__kmp_affinity_verbose || 2730 (__kmp_affinity_warnings && 2731 (__kmp_affinity_type != affinity_none))) { 2732 KMP_WARNING(AffIgnoreInvalidProcID, num); 2733 } 2734 KMP_CPU_ZERO(sumMask); 2735 } else { 2736 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2737 setSize = 1; 2738 } 2739 2740 for (;;) { 2741 // Check for end of set. 2742 SKIP_WS(next); 2743 if (*next == '}') { 2744 next++; // skip '}' 2745 break; 2746 } 2747 2748 // Skip optional comma. 2749 if (*next == ',') { 2750 next++; 2751 } 2752 SKIP_WS(next); 2753 2754 // Read the next integer in the set. 2755 scan = next; 2756 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2757 2758 SKIP_DIGITS(next); 2759 num = __kmp_str_to_int(scan, *next); 2760 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2761 2762 // Add the mask for that osId to the sum mask. 
2763 if ((num > maxOsId) || 2764 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2765 if (__kmp_affinity_verbose || 2766 (__kmp_affinity_warnings && 2767 (__kmp_affinity_type != affinity_none))) { 2768 KMP_WARNING(AffIgnoreInvalidProcID, num); 2769 } 2770 } else { 2771 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2772 setSize++; 2773 } 2774 } 2775 if (setSize > 0) { 2776 ADD_MASK(sumMask); 2777 } 2778 2779 SKIP_WS(next); 2780 if (*next == ',') { 2781 next++; 2782 } 2783 scan = next; 2784 continue; 2785 } 2786 2787 // Read the first integer. 2788 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2789 SKIP_DIGITS(next); 2790 start = __kmp_str_to_int(scan, *next); 2791 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2792 SKIP_WS(next); 2793 2794 // If this isn't a range, then add a mask to the list and go on. 2795 if (*next != '-') { 2796 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2797 2798 // Skip optional comma. 2799 if (*next == ',') { 2800 next++; 2801 } 2802 scan = next; 2803 continue; 2804 } 2805 2806 // This is a range. Skip over the '-' and read in the 2nd int. 2807 next++; // skip '-' 2808 SKIP_WS(next); 2809 scan = next; 2810 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2811 SKIP_DIGITS(next); 2812 end = __kmp_str_to_int(scan, *next); 2813 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2814 2815 // Check for a stride parameter 2816 stride = 1; 2817 SKIP_WS(next); 2818 if (*next == ':') { 2819 // A stride is specified. Skip over the ':" and read the 3rd int. 2820 int sign = +1; 2821 next++; // skip ':' 2822 SKIP_WS(next); 2823 scan = next; 2824 if (*next == '-') { 2825 sign = -1; 2826 next++; 2827 SKIP_WS(next); 2828 scan = next; 2829 } 2830 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2831 SKIP_DIGITS(next); 2832 stride = __kmp_str_to_int(scan, *next); 2833 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2834 stride *= sign; 2835 } 2836 2837 // Do some range checks. 2838 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2839 if (stride > 0) { 2840 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2841 } else { 2842 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2843 } 2844 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2845 2846 // Add the mask for each OS proc # to the list. 2847 if (stride > 0) { 2848 do { 2849 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2850 start += stride; 2851 } while (start <= end); 2852 } else { 2853 do { 2854 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2855 start += stride; 2856 } while (start >= end); 2857 } 2858 2859 // Skip optional comma. 2860 SKIP_WS(next); 2861 if (*next == ',') { 2862 next++; 2863 } 2864 scan = next; 2865 } 2866 2867 *out_numMasks = nextNewMask; 2868 if (nextNewMask == 0) { 2869 *out_masks = NULL; 2870 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2871 return; 2872 } 2873 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2874 for (i = 0; i < nextNewMask; i++) { 2875 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2876 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2877 KMP_CPU_COPY(dest, src); 2878 } 2879 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2880 KMP_CPU_FREE(sumMask); 2881 } 2882 2883 #if OMP_40_ENABLED 2884 2885 /*----------------------------------------------------------------------------- 2886 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2887 places. 
Again, Here is the grammar: 2888 2889 place_list := place 2890 place_list := place , place_list 2891 place := num 2892 place := place : num 2893 place := place : num : signed 2894 place := { subplacelist } 2895 place := ! place // (lowest priority) 2896 subplace_list := subplace 2897 subplace_list := subplace , subplace_list 2898 subplace := num 2899 subplace := num : num 2900 subplace := num : num : signed 2901 signed := num 2902 signed := + signed 2903 signed := - signed 2904 -----------------------------------------------------------------------------*/ 2905 2906 static void __kmp_process_subplace_list(const char **scan, 2907 kmp_affin_mask_t *osId2Mask, 2908 int maxOsId, kmp_affin_mask_t *tempMask, 2909 int *setSize) { 2910 const char *next; 2911 2912 for (;;) { 2913 int start, count, stride, i; 2914 2915 // Read in the starting proc id 2916 SKIP_WS(*scan); 2917 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2918 next = *scan; 2919 SKIP_DIGITS(next); 2920 start = __kmp_str_to_int(*scan, *next); 2921 KMP_ASSERT(start >= 0); 2922 *scan = next; 2923 2924 // valid follow sets are ',' ':' and '}' 2925 SKIP_WS(*scan); 2926 if (**scan == '}' || **scan == ',') { 2927 if ((start > maxOsId) || 2928 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2929 if (__kmp_affinity_verbose || 2930 (__kmp_affinity_warnings && 2931 (__kmp_affinity_type != affinity_none))) { 2932 KMP_WARNING(AffIgnoreInvalidProcID, start); 2933 } 2934 } else { 2935 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2936 (*setSize)++; 2937 } 2938 if (**scan == '}') { 2939 break; 2940 } 2941 (*scan)++; // skip ',' 2942 continue; 2943 } 2944 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2945 (*scan)++; // skip ':' 2946 2947 // Read count parameter 2948 SKIP_WS(*scan); 2949 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2950 next = *scan; 2951 SKIP_DIGITS(next); 2952 count = __kmp_str_to_int(*scan, *next); 2953 KMP_ASSERT(count >= 0); 2954 *scan = next; 2955 2956 // valid follow sets are ',' ':' and '}' 2957 SKIP_WS(*scan); 2958 if (**scan == '}' || **scan == ',') { 2959 for (i = 0; i < count; i++) { 2960 if ((start > maxOsId) || 2961 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2962 if (__kmp_affinity_verbose || 2963 (__kmp_affinity_warnings && 2964 (__kmp_affinity_type != affinity_none))) { 2965 KMP_WARNING(AffIgnoreInvalidProcID, start); 2966 } 2967 break; // don't proliferate warnings for large count 2968 } else { 2969 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2970 start++; 2971 (*setSize)++; 2972 } 2973 } 2974 if (**scan == '}') { 2975 break; 2976 } 2977 (*scan)++; // skip ',' 2978 continue; 2979 } 2980 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2981 (*scan)++; // skip ':' 2982 2983 // Read stride parameter 2984 int sign = +1; 2985 for (;;) { 2986 SKIP_WS(*scan); 2987 if (**scan == '+') { 2988 (*scan)++; // skip '+' 2989 continue; 2990 } 2991 if (**scan == '-') { 2992 sign *= -1; 2993 (*scan)++; // skip '-' 2994 continue; 2995 } 2996 break; 2997 } 2998 SKIP_WS(*scan); 2999 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3000 next = *scan; 3001 SKIP_DIGITS(next); 3002 stride = __kmp_str_to_int(*scan, *next); 3003 KMP_ASSERT(stride >= 0); 3004 *scan = next; 3005 stride *= sign; 3006 3007 // valid follow sets are ',' and '}' 3008 SKIP_WS(*scan); 3009 if (**scan == '}' || **scan == ',') { 3010 for (i = 0; i < count; i++) { 3011 if ((start > maxOsId) || 3012 
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3013 if (__kmp_affinity_verbose || 3014 (__kmp_affinity_warnings && 3015 (__kmp_affinity_type != affinity_none))) { 3016 KMP_WARNING(AffIgnoreInvalidProcID, start); 3017 } 3018 break; // don't proliferate warnings for large count 3019 } else { 3020 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3021 start += stride; 3022 (*setSize)++; 3023 } 3024 } 3025 if (**scan == '}') { 3026 break; 3027 } 3028 (*scan)++; // skip ',' 3029 continue; 3030 } 3031 3032 KMP_ASSERT2(0, "bad explicit places list"); 3033 } 3034 } 3035 3036 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3037 int maxOsId, kmp_affin_mask_t *tempMask, 3038 int *setSize) { 3039 const char *next; 3040 3041 // valid follow sets are '{' '!' and num 3042 SKIP_WS(*scan); 3043 if (**scan == '{') { 3044 (*scan)++; // skip '{' 3045 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3046 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3047 (*scan)++; // skip '}' 3048 } else if (**scan == '!') { 3049 (*scan)++; // skip '!' 3050 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3051 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3052 } else if ((**scan >= '0') && (**scan <= '9')) { 3053 next = *scan; 3054 SKIP_DIGITS(next); 3055 int num = __kmp_str_to_int(*scan, *next); 3056 KMP_ASSERT(num >= 0); 3057 if ((num > maxOsId) || 3058 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3059 if (__kmp_affinity_verbose || 3060 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3061 KMP_WARNING(AffIgnoreInvalidProcID, num); 3062 } 3063 } else { 3064 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3065 (*setSize)++; 3066 } 3067 *scan = next; // skip num 3068 } else { 3069 KMP_ASSERT2(0, "bad explicit places list"); 3070 } 3071 } 3072 3073 // static void 3074 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3075 unsigned int *out_numMasks, 3076 const char *placelist, 3077 kmp_affin_mask_t *osId2Mask, 3078 int maxOsId) { 3079 int i, j, count, stride, sign; 3080 const char *scan = placelist; 3081 const char *next = placelist; 3082 3083 numNewMasks = 2; 3084 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3085 nextNewMask = 0; 3086 3087 // tempMask is modified based on the previous or initial 3088 // place to form the current place 3089 // previousMask contains the previous place 3090 kmp_affin_mask_t *tempMask; 3091 kmp_affin_mask_t *previousMask; 3092 KMP_CPU_ALLOC(tempMask); 3093 KMP_CPU_ZERO(tempMask); 3094 KMP_CPU_ALLOC(previousMask); 3095 KMP_CPU_ZERO(previousMask); 3096 int setSize = 0; 3097 3098 for (;;) { 3099 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3100 3101 // valid follow sets are ',' ':' and EOL 3102 SKIP_WS(scan); 3103 if (*scan == '\0' || *scan == ',') { 3104 if (setSize > 0) { 3105 ADD_MASK(tempMask); 3106 } 3107 KMP_CPU_ZERO(tempMask); 3108 setSize = 0; 3109 if (*scan == '\0') { 3110 break; 3111 } 3112 scan++; // skip ',' 3113 continue; 3114 } 3115 3116 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3117 scan++; // skip ':' 3118 3119 // Read count parameter 3120 SKIP_WS(scan); 3121 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3122 next = scan; 3123 SKIP_DIGITS(next); 3124 count = __kmp_str_to_int(scan, *next); 3125 KMP_ASSERT(count >= 0); 3126 scan = next; 3127 3128 // valid follow sets are ',' ':' and EOL 3129 SKIP_WS(scan); 3130 if (*scan == '\0' || *scan == ',') { 3131 stride = 
+1; 3132 } else { 3133 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3134 scan++; // skip ':' 3135 3136 // Read stride parameter 3137 sign = +1; 3138 for (;;) { 3139 SKIP_WS(scan); 3140 if (*scan == '+') { 3141 scan++; // skip '+' 3142 continue; 3143 } 3144 if (*scan == '-') { 3145 sign *= -1; 3146 scan++; // skip '-' 3147 continue; 3148 } 3149 break; 3150 } 3151 SKIP_WS(scan); 3152 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3153 next = scan; 3154 SKIP_DIGITS(next); 3155 stride = __kmp_str_to_int(scan, *next); 3156 KMP_DEBUG_ASSERT(stride >= 0); 3157 scan = next; 3158 stride *= sign; 3159 } 3160 3161 // Add places determined by initial_place : count : stride 3162 for (i = 0; i < count; i++) { 3163 if (setSize == 0) { 3164 break; 3165 } 3166 // Add the current place, then build the next place (tempMask) from that 3167 KMP_CPU_COPY(previousMask, tempMask); 3168 ADD_MASK(previousMask); 3169 KMP_CPU_ZERO(tempMask); 3170 setSize = 0; 3171 KMP_CPU_SET_ITERATE(j, previousMask) { 3172 if (!KMP_CPU_ISSET(j, previousMask)) { 3173 continue; 3174 } 3175 if ((j + stride > maxOsId) || (j + stride < 0) || 3176 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3177 (!KMP_CPU_ISSET(j + stride, 3178 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3179 if ((__kmp_affinity_verbose || 3180 (__kmp_affinity_warnings && 3181 (__kmp_affinity_type != affinity_none))) && 3182 i < count - 1) { 3183 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3184 } 3185 continue; 3186 } 3187 KMP_CPU_SET(j + stride, tempMask); 3188 setSize++; 3189 } 3190 } 3191 KMP_CPU_ZERO(tempMask); 3192 setSize = 0; 3193 3194 // valid follow sets are ',' and EOL 3195 SKIP_WS(scan); 3196 if (*scan == '\0') { 3197 break; 3198 } 3199 if (*scan == ',') { 3200 scan++; // skip ',' 3201 continue; 3202 } 3203 3204 KMP_ASSERT2(0, "bad explicit places list"); 3205 } 3206 3207 *out_numMasks = nextNewMask; 3208 if (nextNewMask == 0) { 3209 *out_masks = NULL; 3210 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3211 return; 3212 } 3213 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3214 KMP_CPU_FREE(tempMask); 3215 KMP_CPU_FREE(previousMask); 3216 for (i = 0; i < nextNewMask; i++) { 3217 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3218 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3219 KMP_CPU_COPY(dest, src); 3220 } 3221 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3222 } 3223 3224 #endif /* OMP_40_ENABLED */ 3225 3226 #undef ADD_MASK 3227 #undef ADD_MASK_OSID 3228 3229 #if KMP_USE_HWLOC 3230 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3231 // skip PUs descendants of the object o 3232 int skipped = 0; 3233 hwloc_obj_t hT = NULL; 3234 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3235 for (int i = 0; i < N; ++i) { 3236 KMP_DEBUG_ASSERT(hT); 3237 unsigned idx = hT->os_index; 3238 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3239 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3240 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3241 ++skipped; 3242 } 3243 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3244 } 3245 return skipped; // count number of skipped units 3246 } 3247 3248 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3249 // check if obj has PUs present in fullMask 3250 hwloc_obj_t hT = NULL; 3251 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3252 for (int i = 0; i < N; ++i) { 3253 KMP_DEBUG_ASSERT(hT); 3254 unsigned idx = hT->os_index; 3255 if (KMP_CPU_ISSET(idx, 
__kmp_affin_fullMask)) 3256 return 1; // found PU 3257 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3258 } 3259 return 0; // no PUs found 3260 } 3261 #endif // KMP_USE_HWLOC 3262 3263 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3264 AddrUnsPair *newAddr; 3265 if (__kmp_hws_requested == 0) 3266 goto _exit; // no topology limiting actions requested, exit 3267 #if KMP_USE_HWLOC 3268 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3269 // Number of subobjects calculated dynamically, this works fine for 3270 // any non-uniform topology. 3271 // L2 cache objects are determined by depth, other objects - by type. 3272 hwloc_topology_t tp = __kmp_hwloc_topology; 3273 int nS = 0, nN = 0, nL = 0, nC = 0, 3274 nT = 0; // logical index including skipped 3275 int nCr = 0, nTr = 0; // number of requested units 3276 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters 3277 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) 3278 int L2depth, idx; 3279 3280 // check support of extensions ---------------------------------- 3281 int numa_support = 0, tile_support = 0; 3282 if (__kmp_pu_os_idx) 3283 hT = hwloc_get_pu_obj_by_os_index(tp, 3284 __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3285 else 3286 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3287 if (hT == NULL) { // something's gone wrong 3288 KMP_WARNING(AffHWSubsetUnsupported); 3289 goto _exit; 3290 } 3291 // check NUMA node 3292 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3293 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3294 if (hN != NULL && hN->depth > hS->depth) { 3295 numa_support = 1; // 1 in case socket includes node(s) 3296 } else if (__kmp_hws_node.num > 0) { 3297 // don't support sockets inside NUMA node (no such HW found for testing) 3298 KMP_WARNING(AffHWSubsetUnsupported); 3299 goto _exit; 3300 } 3301 // check L2 cahce, get object by depth because of multiple caches 3302 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3303 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3304 if (hL != NULL && 3305 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { 3306 tile_support = 1; // no sense to count L2 if it includes single core 3307 } else if (__kmp_hws_tile.num > 0) { 3308 if (__kmp_hws_core.num == 0) { 3309 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3310 __kmp_hws_tile.num = 0; 3311 } else { 3312 // L2 and core are both requested, but represent same object 3313 KMP_WARNING(AffHWSubsetInvalid); 3314 goto _exit; 3315 } 3316 } 3317 // end of check of extensions ----------------------------------- 3318 3319 // fill in unset items, validate settings ----------------------- 3320 if (__kmp_hws_socket.num == 0) 3321 __kmp_hws_socket.num = nPackages; // use all available sockets 3322 if (__kmp_hws_socket.offset >= nPackages) { 3323 KMP_WARNING(AffHWSubsetManySockets); 3324 goto _exit; 3325 } 3326 if (numa_support) { 3327 hN = NULL; 3328 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3329 &hN); // num nodes in socket 3330 if (__kmp_hws_node.num == 0) 3331 __kmp_hws_node.num = NN; // use all available nodes 3332 if (__kmp_hws_node.offset >= NN) { 3333 KMP_WARNING(AffHWSubsetManyNodes); 3334 goto _exit; 3335 } 3336 if (tile_support) { 3337 // get num tiles in node 3338 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3339 if (__kmp_hws_tile.num == 0) { 3340 __kmp_hws_tile.num = NL + 1; 3341 } // use all 
available tiles, some node may have more tiles, thus +1 3342 if (__kmp_hws_tile.offset >= NL) { 3343 KMP_WARNING(AffHWSubsetManyTiles); 3344 goto _exit; 3345 } 3346 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3347 &hC); // num cores in tile 3348 if (__kmp_hws_core.num == 0) 3349 __kmp_hws_core.num = NC; // use all available cores 3350 if (__kmp_hws_core.offset >= NC) { 3351 KMP_WARNING(AffHWSubsetManyCores); 3352 goto _exit; 3353 } 3354 } else { // tile_support 3355 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3356 &hC); // num cores in node 3357 if (__kmp_hws_core.num == 0) 3358 __kmp_hws_core.num = NC; // use all available cores 3359 if (__kmp_hws_core.offset >= NC) { 3360 KMP_WARNING(AffHWSubsetManyCores); 3361 goto _exit; 3362 } 3363 } // tile_support 3364 } else { // numa_support 3365 if (tile_support) { 3366 // get num tiles in socket 3367 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3368 if (__kmp_hws_tile.num == 0) 3369 __kmp_hws_tile.num = NL; // use all available tiles 3370 if (__kmp_hws_tile.offset >= NL) { 3371 KMP_WARNING(AffHWSubsetManyTiles); 3372 goto _exit; 3373 } 3374 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3375 &hC); // num cores in tile 3376 if (__kmp_hws_core.num == 0) 3377 __kmp_hws_core.num = NC; // use all available cores 3378 if (__kmp_hws_core.offset >= NC) { 3379 KMP_WARNING(AffHWSubsetManyCores); 3380 goto _exit; 3381 } 3382 } else { // tile_support 3383 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3384 &hC); // num cores in socket 3385 if (__kmp_hws_core.num == 0) 3386 __kmp_hws_core.num = NC; // use all available cores 3387 if (__kmp_hws_core.offset >= NC) { 3388 KMP_WARNING(AffHWSubsetManyCores); 3389 goto _exit; 3390 } 3391 } // tile_support 3392 } 3393 if (__kmp_hws_proc.num == 0) 3394 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3395 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3396 KMP_WARNING(AffHWSubsetManyProcs); 3397 goto _exit; 3398 } 3399 // end of validation -------------------------------------------- 3400 3401 if (pAddr) // pAddr is NULL in case of affinity_none 3402 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3403 __kmp_avail_proc); // max size 3404 // main loop to form HW subset ---------------------------------- 3405 hS = NULL; 3406 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3407 for (int s = 0; s < NP; ++s) { 3408 // Check Socket ----------------------------------------------- 3409 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3410 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3411 continue; // skip socket if all PUs are out of fullMask 3412 ++nS; // only count objects those have PUs in affinity mask 3413 if (nS <= __kmp_hws_socket.offset || 3414 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3415 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3416 continue; // move to next socket 3417 } 3418 nCr = 0; // count number of cores per socket 3419 // socket requested, go down the topology tree 3420 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3421 if (numa_support) { 3422 nN = 0; 3423 hN = NULL; 3424 // num nodes in current socket 3425 int NN = 3426 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3427 for (int n = 0; n < NN; ++n) { 3428 // Check NUMA Node ---------------------------------------- 3429 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3430 hN = hwloc_get_next_obj_by_type(tp, 
HWLOC_OBJ_NUMANODE, hN); 3431 continue; // skip node if all PUs are out of fullMask 3432 } 3433 ++nN; 3434 if (nN <= __kmp_hws_node.offset || 3435 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3436 // skip node as not requested 3437 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node 3438 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3439 continue; // move to next node 3440 } 3441 // node requested, go down the topology tree 3442 if (tile_support) { 3443 nL = 0; 3444 hL = NULL; 3445 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3446 for (int l = 0; l < NL; ++l) { 3447 // Check L2 (tile) ------------------------------------ 3448 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3449 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3450 continue; // skip tile if all PUs are out of fullMask 3451 } 3452 ++nL; 3453 if (nL <= __kmp_hws_tile.offset || 3454 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3455 // skip tile as not requested 3456 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3457 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3458 continue; // move to next tile 3459 } 3460 // tile requested, go down the topology tree 3461 nC = 0; 3462 hC = NULL; 3463 // num cores in current tile 3464 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3465 HWLOC_OBJ_CORE, &hC); 3466 for (int c = 0; c < NC; ++c) { 3467 // Check Core --------------------------------------- 3468 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3469 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3470 continue; // skip core if all PUs are out of fullMask 3471 } 3472 ++nC; 3473 if (nC <= __kmp_hws_core.offset || 3474 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3475 // skip node as not requested 3476 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3477 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3478 continue; // move to next node 3479 } 3480 // core requested, go down to PUs 3481 nT = 0; 3482 nTr = 0; 3483 hT = NULL; 3484 // num procs in current core 3485 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3486 HWLOC_OBJ_PU, &hT); 3487 for (int t = 0; t < NT; ++t) { 3488 // Check PU --------------------------------------- 3489 idx = hT->os_index; 3490 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3491 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3492 continue; // skip PU if not in fullMask 3493 } 3494 ++nT; 3495 if (nT <= __kmp_hws_proc.offset || 3496 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3497 // skip PU 3498 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3499 ++n_old; 3500 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3501 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3502 continue; // move to next node 3503 } 3504 ++nTr; 3505 if (pAddr) // collect requested thread's data 3506 newAddr[n_new] = (*pAddr)[n_old]; 3507 ++n_new; 3508 ++n_old; 3509 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3510 } // threads loop 3511 if (nTr > 0) { 3512 ++nCr; // num cores per socket 3513 ++nCo; // total num cores 3514 if (nTr > nTpC) 3515 nTpC = nTr; // calc max threads per core 3516 } 3517 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3518 } // cores loop 3519 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3520 } // tiles loop 3521 } else { // tile_support 3522 // no tiles, check cores 3523 nC = 0; 3524 hC = NULL; 3525 // num cores in current node 3526 int NC = 3527 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3528 for (int c = 0; c < NC; ++c) { 3529 // Check Core 
--------------------------------------- 3530 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3531 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3532 continue; // skip core if all PUs are out of fullMask 3533 } 3534 ++nC; 3535 if (nC <= __kmp_hws_core.offset || 3536 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3537 // skip node as not requested 3538 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3539 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3540 continue; // move to next node 3541 } 3542 // core requested, go down to PUs 3543 nT = 0; 3544 nTr = 0; 3545 hT = NULL; 3546 int NT = 3547 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3548 for (int t = 0; t < NT; ++t) { 3549 // Check PU --------------------------------------- 3550 idx = hT->os_index; 3551 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3552 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3553 continue; // skip PU if not in fullMask 3554 } 3555 ++nT; 3556 if (nT <= __kmp_hws_proc.offset || 3557 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3558 // skip PU 3559 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3560 ++n_old; 3561 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3562 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3563 continue; // move to next node 3564 } 3565 ++nTr; 3566 if (pAddr) // collect requested thread's data 3567 newAddr[n_new] = (*pAddr)[n_old]; 3568 ++n_new; 3569 ++n_old; 3570 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3571 } // threads loop 3572 if (nTr > 0) { 3573 ++nCr; // num cores per socket 3574 ++nCo; // total num cores 3575 if (nTr > nTpC) 3576 nTpC = nTr; // calc max threads per core 3577 } 3578 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3579 } // cores loop 3580 } // tiles support 3581 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3582 } // nodes loop 3583 } else { // numa_support 3584 // no NUMA support 3585 if (tile_support) { 3586 nL = 0; 3587 hL = NULL; 3588 // num tiles in current socket 3589 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3590 for (int l = 0; l < NL; ++l) { 3591 // Check L2 (tile) ------------------------------------ 3592 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3593 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3594 continue; // skip tile if all PUs are out of fullMask 3595 } 3596 ++nL; 3597 if (nL <= __kmp_hws_tile.offset || 3598 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3599 // skip tile as not requested 3600 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3601 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3602 continue; // move to next tile 3603 } 3604 // tile requested, go down the topology tree 3605 nC = 0; 3606 hC = NULL; 3607 // num cores per tile 3608 int NC = 3609 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); 3610 for (int c = 0; c < NC; ++c) { 3611 // Check Core --------------------------------------- 3612 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3613 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3614 continue; // skip core if all PUs are out of fullMask 3615 } 3616 ++nC; 3617 if (nC <= __kmp_hws_core.offset || 3618 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3619 // skip node as not requested 3620 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3621 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3622 continue; // move to next node 3623 } 3624 // core requested, go down to PUs 3625 nT = 0; 3626 nTr = 0; 3627 hT = NULL; 3628 // num procs per core 3629 int NT 
= 3630 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3631 for (int t = 0; t < NT; ++t) { 3632 // Check PU --------------------------------------- 3633 idx = hT->os_index; 3634 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3635 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3636 continue; // skip PU if not in fullMask 3637 } 3638 ++nT; 3639 if (nT <= __kmp_hws_proc.offset || 3640 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3641 // skip PU 3642 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3643 ++n_old; 3644 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3645 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3646 continue; // move to next node 3647 } 3648 ++nTr; 3649 if (pAddr) // collect requested thread's data 3650 newAddr[n_new] = (*pAddr)[n_old]; 3651 ++n_new; 3652 ++n_old; 3653 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3654 } // threads loop 3655 if (nTr > 0) { 3656 ++nCr; // num cores per socket 3657 ++nCo; // total num cores 3658 if (nTr > nTpC) 3659 nTpC = nTr; // calc max threads per core 3660 } 3661 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3662 } // cores loop 3663 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3664 } // tiles loop 3665 } else { // tile_support 3666 // no tiles, check cores 3667 nC = 0; 3668 hC = NULL; 3669 // num cores in socket 3670 int NC = 3671 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3672 for (int c = 0; c < NC; ++c) { 3673 // Check Core ------------------------------------------- 3674 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3675 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3676 continue; // skip core if all PUs are out of fullMask 3677 } 3678 ++nC; 3679 if (nC <= __kmp_hws_core.offset || 3680 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3681 // skip node as not requested 3682 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3683 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3684 continue; // move to next node 3685 } 3686 // core requested, go down to PUs 3687 nT = 0; 3688 nTr = 0; 3689 hT = NULL; 3690 // num procs per core 3691 int NT = 3692 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3693 for (int t = 0; t < NT; ++t) { 3694 // Check PU --------------------------------------- 3695 idx = hT->os_index; 3696 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3697 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3698 continue; // skip PU if not in fullMask 3699 } 3700 ++nT; 3701 if (nT <= __kmp_hws_proc.offset || 3702 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3703 // skip PU 3704 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3705 ++n_old; 3706 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3707 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3708 continue; // move to next node 3709 } 3710 ++nTr; 3711 if (pAddr) // collect requested thread's data 3712 newAddr[n_new] = (*pAddr)[n_old]; 3713 ++n_new; 3714 ++n_old; 3715 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3716 } // threads loop 3717 if (nTr > 0) { 3718 ++nCr; // num cores per socket 3719 ++nCo; // total num cores 3720 if (nTr > nTpC) 3721 nTpC = nTr; // calc max threads per core 3722 } 3723 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3724 } // cores loop 3725 } // tiles support 3726 } // numa_support 3727 if (nCr > 0) { // found cores? 
3728 ++nPkg; // num sockets 3729 if (nCr > nCpP) 3730 nCpP = nCr; // calc max cores per socket 3731 } 3732 } // sockets loop 3733 3734 // check the subset is valid 3735 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3736 KMP_DEBUG_ASSERT(nPkg > 0); 3737 KMP_DEBUG_ASSERT(nCpP > 0); 3738 KMP_DEBUG_ASSERT(nTpC > 0); 3739 KMP_DEBUG_ASSERT(nCo > 0); 3740 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3741 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3742 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3743 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3744 3745 nPackages = nPkg; // correct num sockets 3746 nCoresPerPkg = nCpP; // correct num cores per socket 3747 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3748 __kmp_avail_proc = n_new; // correct num procs 3749 __kmp_ncores = nCo; // correct num cores 3750 // hwloc topology method end 3751 } else 3752 #endif // KMP_USE_HWLOC 3753 { 3754 int n_old = 0, n_new = 0, proc_num = 0; 3755 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3756 KMP_WARNING(AffHWSubsetNoHWLOC); 3757 goto _exit; 3758 } 3759 if (__kmp_hws_socket.num == 0) 3760 __kmp_hws_socket.num = nPackages; // use all available sockets 3761 if (__kmp_hws_core.num == 0) 3762 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3763 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3764 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3765 if (!__kmp_affinity_uniform_topology()) { 3766 KMP_WARNING(AffHWSubsetNonUniform); 3767 goto _exit; // don't support non-uniform topology 3768 } 3769 if (depth > 3) { 3770 KMP_WARNING(AffHWSubsetNonThreeLevel); 3771 goto _exit; // don't support not-3-level topology 3772 } 3773 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3774 KMP_WARNING(AffHWSubsetManySockets); 3775 goto _exit; 3776 } 3777 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 3778 KMP_WARNING(AffHWSubsetManyCores); 3779 goto _exit; 3780 } 3781 // Form the requested subset 3782 if (pAddr) // pAddr is NULL in case of affinity_none 3783 newAddr = (AddrUnsPair *)__kmp_allocate( 3784 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * 3785 __kmp_hws_proc.num); 3786 for (int i = 0; i < nPackages; ++i) { 3787 if (i < __kmp_hws_socket.offset || 3788 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3789 // skip not-requested socket 3790 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3791 if (__kmp_pu_os_idx != NULL) { 3792 // walk through skipped socket 3793 for (int j = 0; j < nCoresPerPkg; ++j) { 3794 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3795 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3796 ++proc_num; 3797 } 3798 } 3799 } 3800 } else { 3801 // walk through requested socket 3802 for (int j = 0; j < nCoresPerPkg; ++j) { 3803 if (j < __kmp_hws_core.offset || 3804 j >= __kmp_hws_core.offset + 3805 __kmp_hws_core.num) { // skip not-requested core 3806 n_old += __kmp_nThreadsPerCore; 3807 if (__kmp_pu_os_idx != NULL) { 3808 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3809 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3810 ++proc_num; 3811 } 3812 } 3813 } else { 3814 // walk through requested core 3815 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3816 if (k < __kmp_hws_proc.num) { 3817 if (pAddr) // collect requested thread's data 3818 newAddr[n_new] = (*pAddr)[n_old]; 3819 n_new++; 3820 } else { 3821 if (__kmp_pu_os_idx != NULL) 3822 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3823 } 3824 n_old++; 3825 ++proc_num; 3826 
} 3827 } 3828 } 3829 } 3830 } 3831 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3832 KMP_DEBUG_ASSERT(n_new == 3833 __kmp_hws_socket.num * __kmp_hws_core.num * 3834 __kmp_hws_proc.num); 3835 nPackages = __kmp_hws_socket.num; // correct nPackages 3836 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3837 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3838 __kmp_avail_proc = n_new; // correct avail_proc 3839 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3840 } // non-hwloc topology method 3841 if (pAddr) { 3842 __kmp_free(*pAddr); 3843 *pAddr = newAddr; // replace old topology with new one 3844 } 3845 if (__kmp_affinity_verbose) { 3846 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3847 __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, 3848 __kmp_affin_fullMask); 3849 if (__kmp_affinity_respect_mask) { 3850 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3851 } else { 3852 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3853 } 3854 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3855 kmp_str_buf_t buf; 3856 __kmp_str_buf_init(&buf); 3857 __kmp_str_buf_print(&buf, "%d", nPackages); 3858 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3859 __kmp_nThreadsPerCore, __kmp_ncores); 3860 __kmp_str_buf_free(&buf); 3861 } 3862 _exit: 3863 if (__kmp_pu_os_idx != NULL) { 3864 __kmp_free(__kmp_pu_os_idx); 3865 __kmp_pu_os_idx = NULL; 3866 } 3867 } 3868 3869 // This function figures out the deepest level at which there is at least one 3870 // cluster/core with more than one processing unit bound to it. 3871 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3872 int nprocs, int bottom_level) { 3873 int core_level = 0; 3874 3875 for (int i = 0; i < nprocs; i++) { 3876 for (int j = bottom_level; j > 0; j--) { 3877 if (address2os[i].first.labels[j] > 0) { 3878 if (core_level < (j - 1)) { 3879 core_level = j - 1; 3880 } 3881 } 3882 } 3883 } 3884 return core_level; 3885 } 3886 3887 // This function counts number of clusters/cores at given level. 3888 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3889 int nprocs, int bottom_level, 3890 int core_level) { 3891 int ncores = 0; 3892 int i, j; 3893 3894 j = bottom_level; 3895 for (i = 0; i < nprocs; i++) { 3896 for (j = bottom_level; j > core_level; j--) { 3897 if ((i + 1) < nprocs) { 3898 if (address2os[i + 1].first.labels[j] > 0) { 3899 break; 3900 } 3901 } 3902 } 3903 if (j == core_level) { 3904 ncores++; 3905 } 3906 } 3907 if (j > core_level) { 3908 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 3909 // core. May occur when called from __kmp_affinity_find_core(). 3910 ncores++; 3911 } 3912 return ncores; 3913 } 3914 3915 // This function finds to which cluster/core given processing unit is bound. 3916 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, 3917 int bottom_level, int core_level) { 3918 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, 3919 core_level) - 3920 1; 3921 } 3922 3923 // This function finds maximal number of processing units bound to a 3924 // cluster/core at given level. 
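// For illustration (hypothetical labels): if core_level addresses the core
// layer, the routine below takes the largest labels[core_level + 1] value
// seen over all procs and adds 1, so a machine where most cores expose 2 HW
// threads but one exposes 4 reports 4; cores with fewer threads never lower
// the result.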
3925 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 3926 int nprocs, int bottom_level, 3927 int core_level) { 3928 int maxprocpercore = 0; 3929 3930 if (core_level < bottom_level) { 3931 for (int i = 0; i < nprocs; i++) { 3932 int percore = address2os[i].first.labels[core_level + 1] + 1; 3933 3934 if (percore > maxprocpercore) { 3935 maxprocpercore = percore; 3936 } 3937 } 3938 } else { 3939 maxprocpercore = 1; 3940 } 3941 return maxprocpercore; 3942 } 3943 3944 static AddrUnsPair *address2os = NULL; 3945 static int *procarr = NULL; 3946 static int __kmp_aff_depth = 0; 3947 3948 #define KMP_EXIT_AFF_NONE \ 3949 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 3950 KMP_ASSERT(address2os == NULL); \ 3951 __kmp_apply_thread_places(NULL, 0); \ 3952 return; 3953 3954 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 3955 const Address *aa = &(((const AddrUnsPair *)a)->first); 3956 const Address *bb = &(((const AddrUnsPair *)b)->first); 3957 unsigned depth = aa->depth; 3958 unsigned i; 3959 KMP_DEBUG_ASSERT(depth == bb->depth); 3960 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 3961 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 3962 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 3963 int j = depth - i - 1; 3964 if (aa->childNums[j] < bb->childNums[j]) 3965 return -1; 3966 if (aa->childNums[j] > bb->childNums[j]) 3967 return 1; 3968 } 3969 for (; i < depth; i++) { 3970 int j = i - __kmp_affinity_compact; 3971 if (aa->childNums[j] < bb->childNums[j]) 3972 return -1; 3973 if (aa->childNums[j] > bb->childNums[j]) 3974 return 1; 3975 } 3976 return 0; 3977 } 3978 3979 static void __kmp_aux_affinity_initialize(void) { 3980 if (__kmp_affinity_masks != NULL) { 3981 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3982 return; 3983 } 3984 3985 // Create the "full" mask - this defines all of the processors that we 3986 // consider to be in the machine model. If respect is set, then it is the 3987 // initialization thread's affinity mask. Otherwise, it is all processors that 3988 // we know about on the machine. 3989 if (__kmp_affin_fullMask == NULL) { 3990 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3991 } 3992 if (KMP_AFFINITY_CAPABLE()) { 3993 if (__kmp_affinity_respect_mask) { 3994 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3995 3996 // Count the number of available processors. 
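// Note: this count can exceed __kmp_xproc if the inherited process mask names
// processors the runtime never detected; that case is treated as an
// initialization failure below and affinity is turned off rather than
// trusting an inconsistent mask.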
3997 unsigned i; 3998 __kmp_avail_proc = 0; 3999 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 4000 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 4001 continue; 4002 } 4003 __kmp_avail_proc++; 4004 } 4005 if (__kmp_avail_proc > __kmp_xproc) { 4006 if (__kmp_affinity_verbose || 4007 (__kmp_affinity_warnings && 4008 (__kmp_affinity_type != affinity_none))) { 4009 KMP_WARNING(ErrorInitializeAffinity); 4010 } 4011 __kmp_affinity_type = affinity_none; 4012 KMP_AFFINITY_DISABLE(); 4013 return; 4014 } 4015 } else { 4016 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 4017 __kmp_avail_proc = __kmp_xproc; 4018 } 4019 } 4020 4021 if (__kmp_affinity_gran == affinity_gran_tile && 4022 // check if user's request is valid 4023 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { 4024 KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); 4025 __kmp_affinity_gran = affinity_gran_package; 4026 } 4027 4028 int depth = -1; 4029 kmp_i18n_id_t msg_id = kmp_i18n_null; 4030 4031 // For backward compatibility, setting KMP_CPUINFO_FILE => 4032 // KMP_TOPOLOGY_METHOD=cpuinfo 4033 if ((__kmp_cpuinfo_file != NULL) && 4034 (__kmp_affinity_top_method == affinity_top_method_all)) { 4035 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 4036 } 4037 4038 if (__kmp_affinity_top_method == affinity_top_method_all) { 4039 // In the default code path, errors are not fatal - we just try using 4040 // another method. We only emit a warning message if affinity is on, or the 4041 // verbose flag is set, an the nowarnings flag was not set. 4042 const char *file_name = NULL; 4043 int line = 0; 4044 #if KMP_USE_HWLOC 4045 if (depth < 0 && 4046 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4047 if (__kmp_affinity_verbose) { 4048 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4049 } 4050 if (!__kmp_hwloc_error) { 4051 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4052 if (depth == 0) { 4053 KMP_EXIT_AFF_NONE; 4054 } else if (depth < 0 && __kmp_affinity_verbose) { 4055 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4056 } 4057 } else if (__kmp_affinity_verbose) { 4058 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4059 } 4060 } 4061 #endif 4062 4063 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4064 4065 if (depth < 0) { 4066 if (__kmp_affinity_verbose) { 4067 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4068 } 4069 4070 file_name = NULL; 4071 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4072 if (depth == 0) { 4073 KMP_EXIT_AFF_NONE; 4074 } 4075 4076 if (depth < 0) { 4077 if (__kmp_affinity_verbose) { 4078 if (msg_id != kmp_i18n_null) { 4079 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", 4080 __kmp_i18n_catgets(msg_id), 4081 KMP_I18N_STR(DecodingLegacyAPIC)); 4082 } else { 4083 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4084 KMP_I18N_STR(DecodingLegacyAPIC)); 4085 } 4086 } 4087 4088 file_name = NULL; 4089 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4090 if (depth == 0) { 4091 KMP_EXIT_AFF_NONE; 4092 } 4093 } 4094 } 4095 4096 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4097 4098 #if KMP_OS_LINUX 4099 4100 if (depth < 0) { 4101 if (__kmp_affinity_verbose) { 4102 if (msg_id != kmp_i18n_null) { 4103 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", 4104 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4105 } else { 4106 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4107 } 4108 } 4109 4110 FILE *f = fopen("/proc/cpuinfo", "r"); 4111 if (f == NULL) { 4112 msg_id = kmp_i18n_str_CantOpenCpuinfo; 4113 } else { 4114 file_name = 
"/proc/cpuinfo"; 4115 depth = 4116 __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4117 fclose(f); 4118 if (depth == 0) { 4119 KMP_EXIT_AFF_NONE; 4120 } 4121 } 4122 } 4123 4124 #endif /* KMP_OS_LINUX */ 4125 4126 #if KMP_GROUP_AFFINITY 4127 4128 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4129 if (__kmp_affinity_verbose) { 4130 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4131 } 4132 4133 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4134 KMP_ASSERT(depth != 0); 4135 } 4136 4137 #endif /* KMP_GROUP_AFFINITY */ 4138 4139 if (depth < 0) { 4140 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4141 if (file_name == NULL) { 4142 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4143 } else if (line == 0) { 4144 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4145 } else { 4146 KMP_INFORM(UsingFlatOSFileLine, file_name, line, 4147 __kmp_i18n_catgets(msg_id)); 4148 } 4149 } 4150 // FIXME - print msg if msg_id = kmp_i18n_null ??? 4151 4152 file_name = ""; 4153 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4154 if (depth == 0) { 4155 KMP_EXIT_AFF_NONE; 4156 } 4157 KMP_ASSERT(depth > 0); 4158 KMP_ASSERT(address2os != NULL); 4159 } 4160 } 4161 4162 #if KMP_USE_HWLOC 4163 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4164 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4165 if (__kmp_affinity_verbose) { 4166 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4167 } 4168 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4169 if (depth == 0) { 4170 KMP_EXIT_AFF_NONE; 4171 } 4172 } 4173 #endif // KMP_USE_HWLOC 4174 4175 // If the user has specified that a paricular topology discovery method is to be 4176 // used, then we abort if that method fails. The exception is group affinity, 4177 // which might have been implicitly set. 
4178 4179 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4180 4181 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 4182 if (__kmp_affinity_verbose) { 4183 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4184 } 4185 4186 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4187 if (depth == 0) { 4188 KMP_EXIT_AFF_NONE; 4189 } 4190 if (depth < 0) { 4191 KMP_ASSERT(msg_id != kmp_i18n_null); 4192 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4193 } 4194 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4195 if (__kmp_affinity_verbose) { 4196 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4197 } 4198 4199 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4200 if (depth == 0) { 4201 KMP_EXIT_AFF_NONE; 4202 } 4203 if (depth < 0) { 4204 KMP_ASSERT(msg_id != kmp_i18n_null); 4205 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4206 } 4207 } 4208 4209 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4210 4211 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4212 const char *filename; 4213 if (__kmp_cpuinfo_file != NULL) { 4214 filename = __kmp_cpuinfo_file; 4215 } else { 4216 filename = "/proc/cpuinfo"; 4217 } 4218 4219 if (__kmp_affinity_verbose) { 4220 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4221 } 4222 4223 FILE *f = fopen(filename, "r"); 4224 if (f == NULL) { 4225 int code = errno; 4226 if (__kmp_cpuinfo_file != NULL) { 4227 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), 4228 KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null); 4229 } else { 4230 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), 4231 __kmp_msg_null); 4232 } 4233 } 4234 int line = 0; 4235 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4236 fclose(f); 4237 if (depth < 0) { 4238 KMP_ASSERT(msg_id != kmp_i18n_null); 4239 if (line > 0) { 4240 KMP_FATAL(FileLineMsgExiting, filename, line, 4241 __kmp_i18n_catgets(msg_id)); 4242 } else { 4243 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4244 } 4245 } 4246 if (__kmp_affinity_type == affinity_none) { 4247 KMP_ASSERT(depth == 0); 4248 KMP_EXIT_AFF_NONE; 4249 } 4250 } 4251 4252 #if KMP_GROUP_AFFINITY 4253 4254 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4255 if (__kmp_affinity_verbose) { 4256 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4257 } 4258 4259 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4260 KMP_ASSERT(depth != 0); 4261 if (depth < 0) { 4262 KMP_ASSERT(msg_id != kmp_i18n_null); 4263 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4264 } 4265 } 4266 4267 #endif /* KMP_GROUP_AFFINITY */ 4268 4269 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4270 if (__kmp_affinity_verbose) { 4271 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4272 } 4273 4274 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4275 if (depth == 0) { 4276 KMP_EXIT_AFF_NONE; 4277 } 4278 // should not fail 4279 KMP_ASSERT(depth > 0); 4280 KMP_ASSERT(address2os != NULL); 4281 } 4282 4283 if (address2os == NULL) { 4284 if (KMP_AFFINITY_CAPABLE() && 4285 (__kmp_affinity_verbose || 4286 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4287 KMP_WARNING(ErrorInitializeAffinity); 4288 } 4289 __kmp_affinity_type = affinity_none; 4290 KMP_AFFINITY_DISABLE(); 4291 return; 4292 } 4293 4294 if (__kmp_affinity_gran == affinity_gran_tile 4295 #if KMP_USE_HWLOC 4296 && __kmp_tile_depth == 0 
4297 #endif 4298 ) { 4299 // tiles requested but not detected, warn user on this 4300 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); 4301 } 4302 4303 __kmp_apply_thread_places(&address2os, depth); 4304 4305 // Create the table of masks, indexed by thread Id. 4306 unsigned maxIndex; 4307 unsigned numUnique; 4308 kmp_affin_mask_t *osId2Mask = 4309 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4310 if (__kmp_affinity_gran_levels == 0) { 4311 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4312 } 4313 4314 // Set the childNums vector in all Address objects. This must be done before 4315 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4316 // account the setting of __kmp_affinity_compact. 4317 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4318 4319 switch (__kmp_affinity_type) { 4320 4321 case affinity_explicit: 4322 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4323 #if OMP_40_ENABLED 4324 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4325 #endif 4326 { 4327 __kmp_affinity_process_proclist( 4328 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4329 __kmp_affinity_proclist, osId2Mask, maxIndex); 4330 } 4331 #if OMP_40_ENABLED 4332 else { 4333 __kmp_affinity_process_placelist( 4334 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4335 __kmp_affinity_proclist, osId2Mask, maxIndex); 4336 } 4337 #endif 4338 if (__kmp_affinity_num_masks == 0) { 4339 if (__kmp_affinity_verbose || 4340 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4341 KMP_WARNING(AffNoValidProcID); 4342 } 4343 __kmp_affinity_type = affinity_none; 4344 return; 4345 } 4346 break; 4347 4348 // The other affinity types rely on sorting the Addresses according to some 4349 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4350 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4351 // to do the sort and create the array of affinity masks. 
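// Illustration (hypothetical 3-level machine: package, core, thread, so
// depth == 3): with KMP_AFFINITY=compact (permute defaulting to 0) the outer
// levels dominate the sort, so consecutive masks differ in the thread id
// first and neighbors share a core; KMP_AFFINITY=scatter remaps
// __kmp_affinity_compact to depth - 1 - compact, so the innermost labels
// dominate the comparison and consecutive masks move to a different package
// first.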
4352 4353 case affinity_logical: 4354 __kmp_affinity_compact = 0; 4355 if (__kmp_affinity_offset) { 4356 __kmp_affinity_offset = 4357 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4358 } 4359 goto sortAddresses; 4360 4361 case affinity_physical: 4362 if (__kmp_nThreadsPerCore > 1) { 4363 __kmp_affinity_compact = 1; 4364 if (__kmp_affinity_compact >= depth) { 4365 __kmp_affinity_compact = 0; 4366 } 4367 } else { 4368 __kmp_affinity_compact = 0; 4369 } 4370 if (__kmp_affinity_offset) { 4371 __kmp_affinity_offset = 4372 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4373 } 4374 goto sortAddresses; 4375 4376 case affinity_scatter: 4377 if (__kmp_affinity_compact >= depth) { 4378 __kmp_affinity_compact = 0; 4379 } else { 4380 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4381 } 4382 goto sortAddresses; 4383 4384 case affinity_compact: 4385 if (__kmp_affinity_compact >= depth) { 4386 __kmp_affinity_compact = depth - 1; 4387 } 4388 goto sortAddresses; 4389 4390 case affinity_balanced: 4391 if (depth <= 1) { 4392 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4393 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4394 } 4395 __kmp_affinity_type = affinity_none; 4396 return; 4397 } else if (__kmp_affinity_uniform_topology()) { 4398 break; 4399 } else { // Non-uniform topology 4400 4401 // Save the depth for further usage 4402 __kmp_aff_depth = depth; 4403 4404 int core_level = __kmp_affinity_find_core_level( 4405 address2os, __kmp_avail_proc, depth - 1); 4406 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4407 depth - 1, core_level); 4408 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4409 address2os, __kmp_avail_proc, depth - 1, core_level); 4410 4411 int nproc = ncores * maxprocpercore; 4412 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4413 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4414 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4415 } 4416 __kmp_affinity_type = affinity_none; 4417 return; 4418 } 4419 4420 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4421 for (int i = 0; i < nproc; i++) { 4422 procarr[i] = -1; 4423 } 4424 4425 int lastcore = -1; 4426 int inlastcore = 0; 4427 for (int i = 0; i < __kmp_avail_proc; i++) { 4428 int proc = address2os[i].second; 4429 int core = 4430 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4431 4432 if (core == lastcore) { 4433 inlastcore++; 4434 } else { 4435 inlastcore = 0; 4436 } 4437 lastcore = core; 4438 4439 procarr[core * maxprocpercore + inlastcore] = proc; 4440 } 4441 4442 break; 4443 } 4444 4445 sortAddresses: 4446 // Allocate the gtid->affinity mask table. 4447 if (__kmp_affinity_dups) { 4448 __kmp_affinity_num_masks = __kmp_avail_proc; 4449 } else { 4450 __kmp_affinity_num_masks = numUnique; 4451 } 4452 4453 #if OMP_40_ENABLED 4454 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4455 (__kmp_affinity_num_places > 0) && 4456 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4457 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4458 } 4459 #endif 4460 4461 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4462 4463 // Sort the address2os table according to the current setting of 4464 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
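// When __kmp_affinity_dups is false, only entries marked as leaders of their
// granularity group contribute a mask, so exactly numUnique masks come out of
// the copy loop below; otherwise every available proc contributes a copy of
// its group's mask, giving __kmp_avail_proc masks.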
4465 qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4466 __kmp_affinity_cmp_Address_child_num);
4467 {
4468 int i;
4469 unsigned j;
4470 for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4471 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
4472 continue;
4473 }
4474 unsigned osId = address2os[i].second;
4475 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4476 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4477 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4478 KMP_CPU_COPY(dest, src);
4479 if (++j >= __kmp_affinity_num_masks) {
4480 break;
4481 }
4482 }
4483 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4484 }
4485 break;
4486
4487 default:
4488 KMP_ASSERT2(0, "Unexpected affinity setting");
4489 }
4490
4491 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
4492 machine_hierarchy.init(address2os, __kmp_avail_proc);
4493 }
4494 #undef KMP_EXIT_AFF_NONE
4495
4496 void __kmp_affinity_initialize(void) {
4497 // Much of the code above was written assuming that if a machine was not
4498 // affinity capable, then __kmp_affinity_type == affinity_none. We now
4499 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4500 // There are too many checks for __kmp_affinity_type == affinity_none
4501 // in this code. Instead of trying to change them all, check if
4502 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4503 // affinity_none, call the real initialization routine, then restore
4504 // __kmp_affinity_type to affinity_disabled.
4505 int disabled = (__kmp_affinity_type == affinity_disabled);
4506 if (!KMP_AFFINITY_CAPABLE()) {
4507 KMP_ASSERT(disabled);
4508 }
4509 if (disabled) {
4510 __kmp_affinity_type = affinity_none;
4511 }
4512 __kmp_aux_affinity_initialize();
4513 if (disabled) {
4514 __kmp_affinity_type = affinity_disabled;
4515 }
4516 }
4517
4518 void __kmp_affinity_uninitialize(void) {
4519 if (__kmp_affinity_masks != NULL) {
4520 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4521 __kmp_affinity_masks = NULL;
4522 }
4523 if (__kmp_affin_fullMask != NULL) {
4524 KMP_CPU_FREE(__kmp_affin_fullMask);
4525 __kmp_affin_fullMask = NULL;
4526 }
4527 __kmp_affinity_num_masks = 0;
4528 __kmp_affinity_type = affinity_default;
4529 #if OMP_40_ENABLED
4530 __kmp_affinity_num_places = 0;
4531 #endif
4532 if (__kmp_affinity_proclist != NULL) {
4533 __kmp_free(__kmp_affinity_proclist);
4534 __kmp_affinity_proclist = NULL;
4535 }
4536 if (address2os != NULL) {
4537 __kmp_free(address2os);
4538 address2os = NULL;
4539 }
4540 if (procarr != NULL) {
4541 __kmp_free(procarr);
4542 procarr = NULL;
4543 }
4544 #if KMP_USE_HWLOC
4545 if (__kmp_hwloc_topology != NULL) {
4546 hwloc_topology_destroy(__kmp_hwloc_topology);
4547 __kmp_hwloc_topology = NULL;
4548 }
4549 #endif
4550 KMPAffinity::destroy_api();
4551 }
4552
4553 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4554 if (!KMP_AFFINITY_CAPABLE()) {
4555 return;
4556 }
4557
4558 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4559 if (th->th.th_affin_mask == NULL) {
4560 KMP_CPU_ALLOC(th->th.th_affin_mask);
4561 } else {
4562 KMP_CPU_ZERO(th->th.th_affin_mask);
4563 }
4564
4565 // Copy the thread mask to the kmp_info_t structure. If
4566 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
4567 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
4568 // then the full mask is the same as the mask of the initialization thread.
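// Example (hypothetical values): with __kmp_affinity_num_masks == 4 and
// __kmp_affinity_offset == 1, threads with gtid 0, 1, 2, 3 pick mask indices
// 1, 2, 3, 0 respectively via (gtid + offset) % num_masks below.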
4569 kmp_affin_mask_t *mask; 4570 int i; 4571 4572 #if OMP_40_ENABLED 4573 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4574 #endif 4575 { 4576 if ((__kmp_affinity_type == affinity_none) || 4577 (__kmp_affinity_type == affinity_balanced)) { 4578 #if KMP_GROUP_AFFINITY 4579 if (__kmp_num_proc_groups > 1) { 4580 return; 4581 } 4582 #endif 4583 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4584 i = KMP_PLACE_ALL; 4585 mask = __kmp_affin_fullMask; 4586 } else { 4587 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4588 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4589 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4590 } 4591 } 4592 #if OMP_40_ENABLED 4593 else { 4594 if ((!isa_root) || 4595 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4596 #if KMP_GROUP_AFFINITY 4597 if (__kmp_num_proc_groups > 1) { 4598 return; 4599 } 4600 #endif 4601 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4602 i = KMP_PLACE_ALL; 4603 mask = __kmp_affin_fullMask; 4604 } else { 4605 // int i = some hash function or just a counter that doesn't 4606 // always start at 0. Use gtid for now. 4607 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4608 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4609 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4610 } 4611 } 4612 #endif 4613 4614 #if OMP_40_ENABLED 4615 th->th.th_current_place = i; 4616 if (isa_root) { 4617 th->th.th_new_place = i; 4618 th->th.th_first_place = 0; 4619 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4620 } 4621 4622 if (i == KMP_PLACE_ALL) { 4623 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4624 gtid)); 4625 } else { 4626 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4627 gtid, i)); 4628 } 4629 #else 4630 if (i == -1) { 4631 KA_TRACE( 4632 100, 4633 ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4634 gtid)); 4635 } else { 4636 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4637 gtid, i)); 4638 } 4639 #endif /* OMP_40_ENABLED */ 4640 4641 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4642 4643 if (__kmp_affinity_verbose) { 4644 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4645 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4646 th->th.th_affin_mask); 4647 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4648 __kmp_gettid(), gtid, buf); 4649 } 4650 4651 #if KMP_OS_WINDOWS 4652 // On Windows* OS, the process affinity mask might have changed. If the user 4653 // didn't request affinity and this call fails, just continue silently. 4654 // See CQ171393. 4655 if (__kmp_affinity_type == affinity_none) { 4656 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4657 } else 4658 #endif 4659 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4660 } 4661 4662 #if OMP_40_ENABLED 4663 4664 void __kmp_affinity_set_place(int gtid) { 4665 int retval; 4666 4667 if (!KMP_AFFINITY_CAPABLE()) { 4668 return; 4669 } 4670 4671 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4672 4673 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4674 "place = %d)\n", 4675 gtid, th->th.th_new_place, th->th.th_current_place)); 4676 4677 // Check that the new place is within this thread's partition. 
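// A place partition may wrap around the end of the place list: when
// th_first_place > th_last_place (e.g. with first == 6 and last == 1 out of 8
// places, the partition is {6, 7, 0, 1}), so the second branch of the check
// below only sanity-checks the wrapped case instead of testing exact
// membership.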
4678 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4679 KMP_ASSERT(th->th.th_new_place >= 0);
4680 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4681 if (th->th.th_first_place <= th->th.th_last_place) {
4682 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4683 (th->th.th_new_place <= th->th.th_last_place));
4684 } else {
4685 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4686 (th->th.th_new_place >= th->th.th_last_place));
4687 }
4688
4689 // Copy the thread mask to the kmp_info_t structure,
4690 // and set this thread's affinity.
4691 kmp_affin_mask_t *mask =
4692 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4693 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4694 th->th.th_current_place = th->th.th_new_place;
4695
4696 if (__kmp_affinity_verbose) {
4697 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4698 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4699 th->th.th_affin_mask);
4700 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4701 __kmp_gettid(), gtid, buf);
4702 }
4703 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4704 }
4705
4706 #endif /* OMP_40_ENABLED */
4707
4708 int __kmp_aux_set_affinity(void **mask) {
4709 int gtid;
4710 kmp_info_t *th;
4711 int retval;
4712
4713 if (!KMP_AFFINITY_CAPABLE()) {
4714 return -1;
4715 }
4716
4717 gtid = __kmp_entry_gtid();
4718 KA_TRACE(1000, ; {
4719 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4720 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4721 (kmp_affin_mask_t *)(*mask));
4722 __kmp_debug_printf(
4723 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4724 buf);
4725 });
4726
4727 if (__kmp_env_consistency_check) {
4728 if ((mask == NULL) || (*mask == NULL)) {
4729 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4730 } else {
4731 unsigned proc;
4732 int num_procs = 0;
4733
4734 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4735 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4736 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4737 }
4738 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4739 continue;
4740 }
4741 num_procs++;
4742 }
4743 if (num_procs == 0) {
4744 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4745 }
4746
4747 #if KMP_GROUP_AFFINITY
4748 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4749 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4750 }
4751 #endif /* KMP_GROUP_AFFINITY */
4752 }
4753 }
4754
4755 th = __kmp_threads[gtid];
4756 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4757 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4758 if (retval == 0) {
4759 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4760 }
4761
4762 #if OMP_40_ENABLED
4763 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4764 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4765 th->th.th_first_place = 0;
4766 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4767
4768 // Turn off 4.0 affinity for the current thread at this parallel level.
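// Setting proc_bind_false in the current task's ICVs (next line) records that
// this thread's mask is now user managed, so place-based binding is not
// re-applied for it at this level.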
4769 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4770 #endif 4771 4772 return retval; 4773 } 4774 4775 int __kmp_aux_get_affinity(void **mask) { 4776 int gtid; 4777 int retval; 4778 kmp_info_t *th; 4779 4780 if (!KMP_AFFINITY_CAPABLE()) { 4781 return -1; 4782 } 4783 4784 gtid = __kmp_entry_gtid(); 4785 th = __kmp_threads[gtid]; 4786 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4787 4788 KA_TRACE(1000, ; { 4789 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4790 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4791 th->th.th_affin_mask); 4792 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4793 gtid, buf); 4794 }); 4795 4796 if (__kmp_env_consistency_check) { 4797 if ((mask == NULL) || (*mask == NULL)) { 4798 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4799 } 4800 } 4801 4802 #if !KMP_OS_WINDOWS 4803 4804 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4805 KA_TRACE(1000, ; { 4806 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4807 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4808 (kmp_affin_mask_t *)(*mask)); 4809 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4810 gtid, buf); 4811 }); 4812 return retval; 4813 4814 #else 4815 4816 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4817 return 0; 4818 4819 #endif /* KMP_OS_WINDOWS */ 4820 } 4821 4822 int __kmp_aux_get_affinity_max_proc() { 4823 if (!KMP_AFFINITY_CAPABLE()) { 4824 return 0; 4825 } 4826 #if KMP_GROUP_AFFINITY 4827 if (__kmp_num_proc_groups > 1) { 4828 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4829 } 4830 #endif 4831 return __kmp_xproc; 4832 } 4833 4834 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4835 int retval; 4836 4837 if (!KMP_AFFINITY_CAPABLE()) { 4838 return -1; 4839 } 4840 4841 KA_TRACE(1000, ; { 4842 int gtid = __kmp_entry_gtid(); 4843 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4844 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4845 (kmp_affin_mask_t *)(*mask)); 4846 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4847 "affinity mask for thread %d = %s\n", 4848 proc, gtid, buf); 4849 }); 4850 4851 if (__kmp_env_consistency_check) { 4852 if ((mask == NULL) || (*mask == NULL)) { 4853 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4854 } 4855 } 4856 4857 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4858 return -1; 4859 } 4860 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4861 return -2; 4862 } 4863 4864 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4865 return 0; 4866 } 4867 4868 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4869 int retval; 4870 4871 if (!KMP_AFFINITY_CAPABLE()) { 4872 return -1; 4873 } 4874 4875 KA_TRACE(1000, ; { 4876 int gtid = __kmp_entry_gtid(); 4877 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4878 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4879 (kmp_affin_mask_t *)(*mask)); 4880 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4881 "affinity mask for thread %d = %s\n", 4882 proc, gtid, buf); 4883 }); 4884 4885 if (__kmp_env_consistency_check) { 4886 if ((mask == NULL) || (*mask == NULL)) { 4887 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4888 } 4889 } 4890 4891 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4892 return -1; 4893 } 4894 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4895 return -2; 4896 } 4897 4898 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4899 return 0; 4900 } 
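// Both mask-editing helpers above return -1 when affinity is unavailable or
// proc is out of range, -2 when proc lies outside __kmp_affin_fullMask, and 0
// on success. A hypothetical user-side sketch of the public entry points they
// back (names from the kmp_* affinity API):
//
//   kmp_affinity_mask_t m;
//   kmp_create_affinity_mask(&m);
//   if (kmp_set_affinity_mask_proc(2, &m) == 0)
//     kmp_set_affinity(&m); // bind the calling thread to OS proc 2
//   kmp_destroy_affinity_mask(&m);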
4901 4902 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4903 int retval; 4904 4905 if (!KMP_AFFINITY_CAPABLE()) { 4906 return -1; 4907 } 4908 4909 KA_TRACE(1000, ; { 4910 int gtid = __kmp_entry_gtid(); 4911 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4912 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4913 (kmp_affin_mask_t *)(*mask)); 4914 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4915 "affinity mask for thread %d = %s\n", 4916 proc, gtid, buf); 4917 }); 4918 4919 if (__kmp_env_consistency_check) { 4920 if ((mask == NULL) || (*mask == NULL)) { 4921 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4922 } 4923 } 4924 4925 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4926 return -1; 4927 } 4928 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4929 return 0; 4930 } 4931 4932 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4933 } 4934 4935 // Dynamic affinity settings - Affinity balanced 4936 void __kmp_balanced_affinity(int tid, int nthreads) { 4937 bool fine_gran = true; 4938 4939 switch (__kmp_affinity_gran) { 4940 case affinity_gran_fine: 4941 case affinity_gran_thread: 4942 break; 4943 case affinity_gran_core: 4944 if (__kmp_nThreadsPerCore > 1) { 4945 fine_gran = false; 4946 } 4947 break; 4948 case affinity_gran_package: 4949 if (nCoresPerPkg > 1) { 4950 fine_gran = false; 4951 } 4952 break; 4953 default: 4954 fine_gran = false; 4955 } 4956 4957 if (__kmp_affinity_uniform_topology()) { 4958 int coreID; 4959 int threadID; 4960 // Number of hyper threads per core in HT machine 4961 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4962 // Number of cores 4963 int ncores = __kmp_ncores; 4964 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4965 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4966 ncores = nPackages; 4967 } 4968 // How many threads will be bound to each core 4969 int chunk = nthreads / ncores; 4970 // How many cores will have an additional thread bound to it - "big cores" 4971 int big_cores = nthreads % ncores; 4972 // Number of threads on the big cores 4973 int big_nth = (chunk + 1) * big_cores; 4974 if (tid < big_nth) { 4975 coreID = tid / (chunk + 1); 4976 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4977 } else { // tid >= big_nth 4978 coreID = (tid - big_cores) / chunk; 4979 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4980 } 4981 4982 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4983 "Illegal set affinity operation when not capable"); 4984 4985 kmp_affin_mask_t *mask; 4986 KMP_CPU_ALLOC_ON_STACK(mask); 4987 KMP_CPU_ZERO(mask); 4988 4989 if (fine_gran) { 4990 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 4991 KMP_CPU_SET(osID, mask); 4992 } else { 4993 for (int i = 0; i < __kmp_nth_per_core; i++) { 4994 int osID; 4995 osID = address2os[coreID * __kmp_nth_per_core + i].second; 4996 KMP_CPU_SET(osID, mask); 4997 } 4998 } 4999 if (__kmp_affinity_verbose) { 5000 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5001 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5002 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5003 __kmp_gettid(), tid, buf); 5004 } 5005 __kmp_set_system_affinity(mask, TRUE); 5006 KMP_CPU_FREE_FROM_STACK(mask); 5007 } else { // Non-uniform topology 5008 5009 kmp_affin_mask_t *mask; 5010 KMP_CPU_ALLOC_ON_STACK(mask); 5011 KMP_CPU_ZERO(mask); 5012 5013 int core_level = __kmp_affinity_find_core_level( 5014 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 5015 int ncores = 
__kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 5016 __kmp_aff_depth - 1, core_level); 5017 int nth_per_core = __kmp_affinity_max_proc_per_core( 5018 address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 5019 5020 // For performance gain consider the special case nthreads == 5021 // __kmp_avail_proc 5022 if (nthreads == __kmp_avail_proc) { 5023 if (fine_gran) { 5024 int osID = address2os[tid].second; 5025 KMP_CPU_SET(osID, mask); 5026 } else { 5027 int core = __kmp_affinity_find_core(address2os, tid, 5028 __kmp_aff_depth - 1, core_level); 5029 for (int i = 0; i < __kmp_avail_proc; i++) { 5030 int osID = address2os[i].second; 5031 if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, 5032 core_level) == core) { 5033 KMP_CPU_SET(osID, mask); 5034 } 5035 } 5036 } 5037 } else if (nthreads <= ncores) { 5038 5039 int core = 0; 5040 for (int i = 0; i < ncores; i++) { 5041 // Check if this core from procarr[] is in the mask 5042 int in_mask = 0; 5043 for (int j = 0; j < nth_per_core; j++) { 5044 if (procarr[i * nth_per_core + j] != -1) { 5045 in_mask = 1; 5046 break; 5047 } 5048 } 5049 if (in_mask) { 5050 if (tid == core) { 5051 for (int j = 0; j < nth_per_core; j++) { 5052 int osID = procarr[i * nth_per_core + j]; 5053 if (osID != -1) { 5054 KMP_CPU_SET(osID, mask); 5055 // For fine granularity it is enough to set the first available 5056 // osID for this core 5057 if (fine_gran) { 5058 break; 5059 } 5060 } 5061 } 5062 break; 5063 } else { 5064 core++; 5065 } 5066 } 5067 } 5068 } else { // nthreads > ncores 5069 // Array to save the number of processors at each core 5070 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 5071 // Array to save the number of cores with "x" available processors; 5072 int *ncores_with_x_procs = 5073 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 5074 // Array to save the number of cores with # procs from x to nth_per_core 5075 int *ncores_with_x_to_max_procs = 5076 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 5077 5078 for (int i = 0; i <= nth_per_core; i++) { 5079 ncores_with_x_procs[i] = 0; 5080 ncores_with_x_to_max_procs[i] = 0; 5081 } 5082 5083 for (int i = 0; i < ncores; i++) { 5084 int cnt = 0; 5085 for (int j = 0; j < nth_per_core; j++) { 5086 if (procarr[i * nth_per_core + j] != -1) { 5087 cnt++; 5088 } 5089 } 5090 nproc_at_core[i] = cnt; 5091 ncores_with_x_procs[cnt]++; 5092 } 5093 5094 for (int i = 0; i <= nth_per_core; i++) { 5095 for (int j = i; j <= nth_per_core; j++) { 5096 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 5097 } 5098 } 5099 5100 // Max number of processors 5101 int nproc = nth_per_core * ncores; 5102 // An array to keep number of threads per each context 5103 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 5104 for (int i = 0; i < nproc; i++) { 5105 newarr[i] = 0; 5106 } 5107 5108 int nth = nthreads; 5109 int flag = 0; 5110 while (nth > 0) { 5111 for (int j = 1; j <= nth_per_core; j++) { 5112 int cnt = ncores_with_x_to_max_procs[j]; 5113 for (int i = 0; i < ncores; i++) { 5114 // Skip the core with 0 processors 5115 if (nproc_at_core[i] == 0) { 5116 continue; 5117 } 5118 for (int k = 0; k < nth_per_core; k++) { 5119 if (procarr[i * nth_per_core + k] != -1) { 5120 if (newarr[i * nth_per_core + k] == 0) { 5121 newarr[i * nth_per_core + k] = 1; 5122 cnt--; 5123 nth--; 5124 break; 5125 } else { 5126 if (flag != 0) { 5127 newarr[i * nth_per_core + k]++; 5128 cnt--; 5129 nth--; 5130 break; 5131 } 5132 } 5133 } 5134 } 5135 if (cnt == 0 || nth == 0) { 5136 break; 5137 } 
5138 } 5139 if (nth == 0) { 5140 break; 5141 } 5142 } 5143 flag = 1; 5144 } 5145 int sum = 0; 5146 for (int i = 0; i < nproc; i++) { 5147 sum += newarr[i]; 5148 if (sum > tid) { 5149 if (fine_gran) { 5150 int osID = procarr[i]; 5151 KMP_CPU_SET(osID, mask); 5152 } else { 5153 int coreID = i / nth_per_core; 5154 for (int ii = 0; ii < nth_per_core; ii++) { 5155 int osID = procarr[coreID * nth_per_core + ii]; 5156 if (osID != -1) { 5157 KMP_CPU_SET(osID, mask); 5158 } 5159 } 5160 } 5161 break; 5162 } 5163 } 5164 __kmp_free(newarr); 5165 } 5166 5167 if (__kmp_affinity_verbose) { 5168 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5169 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5170 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5171 __kmp_gettid(), tid, buf); 5172 } 5173 __kmp_set_system_affinity(mask, TRUE); 5174 KMP_CPU_FREE_FROM_STACK(mask); 5175 } 5176 } 5177 5178 #if KMP_OS_LINUX 5179 // We don't need this entry for Windows because 5180 // there is GetProcessAffinityMask() api 5181 // 5182 // The intended usage is indicated by these steps: 5183 // 1) The user gets the current affinity mask 5184 // 2) Then sets the affinity by calling this function 5185 // 3) Error check the return value 5186 // 4) Use non-OpenMP parallelization 5187 // 5) Reset the affinity to what was stored in step 1) 5188 #ifdef __cplusplus 5189 extern "C" 5190 #endif 5191 int 5192 kmp_set_thread_affinity_mask_initial() 5193 // the function returns 0 on success, 5194 // -1 if we cannot bind thread 5195 // >0 (errno) if an error happened during binding 5196 { 5197 int gtid = __kmp_get_gtid(); 5198 if (gtid < 0) { 5199 // Do not touch non-omp threads 5200 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5201 "non-omp thread, returning\n")); 5202 return -1; 5203 } 5204 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5205 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5206 "affinity not initialized, returning\n")); 5207 return -1; 5208 } 5209 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5210 "set full mask for thread %d\n", 5211 gtid)); 5212 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5213 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5214 } 5215 #endif 5216 5217 #endif // KMP_AFFINITY_SUPPORTED 5218
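// Illustrative (hypothetical) use of kmp_set_thread_affinity_mask_initial()
// following the steps listed above it; run_non_openmp_parallel_work() is a
// made-up placeholder for the user's code (Linux, _GNU_SOURCE assumed):
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//   int rc = kmp_set_thread_affinity_mask_initial();               // steps 2-3
//   if (rc == 0)
//     run_non_openmp_parallel_work();                              // step 4
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5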