1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // The LLVM Compiler Infrastructure 8 // 9 // This file is dual licensed under the MIT and the University of Illinois Open 10 // Source Licenses. See LICENSE.txt for details. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "kmp.h" 15 #include "kmp_affinity.h" 16 #include "kmp_i18n.h" 17 #include "kmp_io.h" 18 #include "kmp_str.h" 19 #include "kmp_wrapper_getpid.h" 20 21 // Store the real or imagined machine hierarchy here 22 static hierarchy_info machine_hierarchy; 23 24 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } 25 26 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 27 kmp_uint32 depth; 28 // The test below is true if affinity is available, but set to "none". Need to 29 // init on first use of hierarchical barrier. 30 if (TCR_1(machine_hierarchy.uninitialized)) 31 machine_hierarchy.init(NULL, nproc); 32 33 // Adjust the hierarchy in case num threads exceeds original 34 if (nproc > machine_hierarchy.base_num_threads) 35 machine_hierarchy.resize(nproc); 36 37 depth = machine_hierarchy.depth; 38 KMP_DEBUG_ASSERT(depth > 0); 39 40 thr_bar->depth = depth; 41 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1; 42 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 43 } 44 45 #if KMP_AFFINITY_SUPPORTED 46 47 bool KMPAffinity::picked_api = false; 48 49 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } 50 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } 51 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); } 52 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); } 53 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } 54 void KMPAffinity::operator delete(void *p) { __kmp_free(p); } 55 56 void KMPAffinity::pick_api() { 57 KMPAffinity *affinity_dispatch; 58 if (picked_api) 59 return; 60 #if KMP_USE_HWLOC 61 // Only use Hwloc if affinity isn't explicitly disabled and 62 // user requests Hwloc topology method 63 if (__kmp_affinity_top_method == affinity_top_method_hwloc && 64 __kmp_affinity_type != affinity_disabled) { 65 affinity_dispatch = new KMPHwlocAffinity(); 66 } else 67 #endif 68 { 69 affinity_dispatch = new KMPNativeAffinity(); 70 } 71 __kmp_affinity_dispatch = affinity_dispatch; 72 picked_api = true; 73 } 74 75 void KMPAffinity::destroy_api() { 76 if (__kmp_affinity_dispatch != NULL) { 77 delete __kmp_affinity_dispatch; 78 __kmp_affinity_dispatch = NULL; 79 picked_api = false; 80 } 81 } 82 83 // Print the affinity mask to the character array in a pretty format. 84 char *__kmp_affinity_print_mask(char *buf, int buf_len, 85 kmp_affin_mask_t *mask) { 86 KMP_ASSERT(buf_len >= 40); 87 char *scan = buf; 88 char *end = buf + buf_len - 1; 89 90 // Find first element / check for empty set. 91 size_t i; 92 i = mask->begin(); 93 if (i == mask->end()) { 94 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}"); 95 while (*scan != '\0') 96 scan++; 97 KMP_ASSERT(scan <= end); 98 return buf; 99 } 100 101 KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i); 102 while (*scan != '\0') 103 scan++; 104 i++; 105 for (; i != mask->end(); i = mask->next(i)) { 106 if (!KMP_CPU_ISSET(i, mask)) { 107 continue; 108 } 109 110 // Check for buffer overflow. 
A string of the form ",<n>" will have at most 111 // 10 characters, plus we want to leave room to print ",...}" if the set is 112 // too large to print for a total of 15 characters. We already left room for 113 // '\0' in setting end. 114 if (end - scan < 15) { 115 break; 116 } 117 KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i); 118 while (*scan != '\0') 119 scan++; 120 } 121 if (i != mask->end()) { 122 KMP_SNPRINTF(scan, end - scan + 1, ",..."); 123 while (*scan != '\0') 124 scan++; 125 } 126 KMP_SNPRINTF(scan, end - scan + 1, "}"); 127 while (*scan != '\0') 128 scan++; 129 KMP_ASSERT(scan <= end); 130 return buf; 131 } 132 133 void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { 134 KMP_CPU_ZERO(mask); 135 136 #if KMP_GROUP_AFFINITY 137 138 if (__kmp_num_proc_groups > 1) { 139 int group; 140 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 141 for (group = 0; group < __kmp_num_proc_groups; group++) { 142 int i; 143 int num = __kmp_GetActiveProcessorCount(group); 144 for (i = 0; i < num; i++) { 145 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 146 } 147 } 148 } else 149 150 #endif /* KMP_GROUP_AFFINITY */ 151 152 { 153 int proc; 154 for (proc = 0; proc < __kmp_xproc; proc++) { 155 KMP_CPU_SET(proc, mask); 156 } 157 } 158 } 159 160 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be 161 // called to renumber the labels from [0..n] and place them into the child_num 162 // vector of the address object. This is done in case the labels used for 163 // the children at one node of the hierarchy differ from those used for 164 // another node at the same level. Example: suppose the machine has 2 nodes 165 // with 2 packages each. The first node contains packages 601 and 602, and 166 // second node contains packages 603 and 604. If we try to sort the table 167 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 168 // because we are paying attention to the labels themselves, not the ordinal 169 // child numbers. By using the child numbers in the sort, the result is 170 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. 
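// A walk-through of the renumbering, using the (hypothetical) labels from the
// example above: with depth 2 and the table sorted as
//   {node 0, pkg 601}, {node 0, pkg 602}, {node 1, pkg 603}, {node 1, pkg 604}
// the routine below assigns the ordinal child numbers
//   {0, 0},            {0, 1},            {1, 0},            {1, 1}
// so that a later sort keyed on child_num can interleave the nodes instead of
// simply following the raw package labels.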
171 static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 172 int numAddrs) { 173 KMP_DEBUG_ASSERT(numAddrs > 0); 174 int depth = address2os->first.depth; 175 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 176 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 177 int labCt; 178 for (labCt = 0; labCt < depth; labCt++) { 179 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 180 lastLabel[labCt] = address2os[0].first.labels[labCt]; 181 } 182 int i; 183 for (i = 1; i < numAddrs; i++) { 184 for (labCt = 0; labCt < depth; labCt++) { 185 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 186 int labCt2; 187 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 188 counts[labCt2] = 0; 189 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 190 } 191 counts[labCt]++; 192 lastLabel[labCt] = address2os[i].first.labels[labCt]; 193 break; 194 } 195 } 196 for (labCt = 0; labCt < depth; labCt++) { 197 address2os[i].first.childNums[labCt] = counts[labCt]; 198 } 199 for (; labCt < (int)Address::maxDepth; labCt++) { 200 address2os[i].first.childNums[labCt] = 0; 201 } 202 } 203 __kmp_free(lastLabel); 204 __kmp_free(counts); 205 } 206 207 // All of the __kmp_affinity_create_*_map() routines should set 208 // __kmp_affinity_masks to a vector of affinity mask objects of length 209 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return 210 // the number of levels in the machine topology tree (zero if 211 // __kmp_affinity_type == affinity_none). 212 // 213 // All of the __kmp_affinity_create_*_map() routines should set 214 // *__kmp_affin_fullMask to the affinity mask for the initialization thread. 215 // They need to save and restore the mask, and it could be needed later, so 216 // saving it is just an optimization to avoid calling kmp_get_system_affinity() 217 // again. 218 kmp_affin_mask_t *__kmp_affin_fullMask = NULL; 219 220 static int nCoresPerPkg, nPackages; 221 static int __kmp_nThreadsPerCore; 222 #ifndef KMP_DFLT_NTH_CORES 223 static int __kmp_ncores; 224 #endif 225 static int *__kmp_pu_os_idx = NULL; 226 227 // __kmp_affinity_uniform_topology() doesn't work when called from 228 // places which support arbitrarily many levels in the machine topology 229 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() 230 // __kmp_affinity_create_x2apicid_map(). 231 inline static bool __kmp_affinity_uniform_topology() { 232 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); 233 } 234 235 // Print out the detailed machine topology map, i.e. the physical locations 236 // of each OS proc. 
237 static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, 238 int depth, int pkgLevel, 239 int coreLevel, int threadLevel) { 240 int proc; 241 242 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); 243 for (proc = 0; proc < len; proc++) { 244 int level; 245 kmp_str_buf_t buf; 246 __kmp_str_buf_init(&buf); 247 for (level = 0; level < depth; level++) { 248 if (level == threadLevel) { 249 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread)); 250 } else if (level == coreLevel) { 251 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core)); 252 } else if (level == pkgLevel) { 253 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package)); 254 } else if (level > pkgLevel) { 255 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node), 256 level - pkgLevel - 1); 257 } else { 258 __kmp_str_buf_print(&buf, "L%d ", level); 259 } 260 __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]); 261 } 262 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second, 263 buf.str); 264 __kmp_str_buf_free(&buf); 265 } 266 } 267 268 #if KMP_USE_HWLOC 269 270 static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len, 271 int depth, int *levels) { 272 int proc; 273 kmp_str_buf_t buf; 274 __kmp_str_buf_init(&buf); 275 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); 276 for (proc = 0; proc < len; proc++) { 277 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package), 278 addrP[proc].first.labels[0]); 279 if (depth > 1) { 280 int level = 1; // iterate over levels 281 int label = 1; // iterate over labels 282 if (__kmp_numa_detected) 283 // node level follows package 284 if (levels[level++] > 0) 285 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node), 286 addrP[proc].first.labels[label++]); 287 if (__kmp_tile_depth > 0) 288 // tile level follows node if any, or package 289 if (levels[level++] > 0) 290 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile), 291 addrP[proc].first.labels[label++]); 292 if (levels[level++] > 0) 293 // core level follows 294 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core), 295 addrP[proc].first.labels[label++]); 296 if (levels[level++] > 0) 297 // thread level is the latest 298 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread), 299 addrP[proc].first.labels[label++]); 300 KMP_DEBUG_ASSERT(label == depth); 301 } 302 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str); 303 __kmp_str_buf_clear(&buf); 304 } 305 __kmp_str_buf_free(&buf); 306 } 307 308 static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile; 309 310 // This function removes the topology levels that are radix 1 and don't offer 311 // further information about the topology. The most common example is when you 312 // have one thread context per core, we don't want the extra thread context 313 // level if it offers no unique labels. So they are removed. 
314 // return value: the new depth of address2os 315 static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh, 316 int depth, int *levels) { 317 int level; 318 int i; 319 int radix1_detected; 320 int new_depth = depth; 321 for (level = depth - 1; level > 0; --level) { 322 // Detect if this level is radix 1 323 radix1_detected = 1; 324 for (i = 1; i < nTh; ++i) { 325 if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) { 326 // There are differing label values for this level so it stays 327 radix1_detected = 0; 328 break; 329 } 330 } 331 if (!radix1_detected) 332 continue; 333 // Radix 1 was detected 334 --new_depth; 335 levels[level] = -1; // mark level as not present in address2os array 336 if (level == new_depth) { 337 // "turn off" deepest level, just decrement the depth that removes 338 // the level from address2os array 339 for (i = 0; i < nTh; ++i) { 340 addrP[i].first.depth--; 341 } 342 } else { 343 // For other levels, we move labels over and also reduce the depth 344 int j; 345 for (j = level; j < new_depth; ++j) { 346 for (i = 0; i < nTh; ++i) { 347 addrP[i].first.labels[j] = addrP[i].first.labels[j + 1]; 348 addrP[i].first.depth--; 349 } 350 levels[j + 1] -= 1; 351 } 352 } 353 } 354 return new_depth; 355 } 356 357 // Returns the number of objects of type 'type' below 'obj' within the topology 358 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is 359 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET 360 // object. 361 static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, 362 hwloc_obj_type_t type) { 363 int retval = 0; 364 hwloc_obj_t first; 365 for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, 366 obj->logical_index, type, 0); 367 first != NULL && 368 hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == 369 obj; 370 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, 371 first)) { 372 ++retval; 373 } 374 return retval; 375 } 376 377 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, 378 hwloc_obj_t o, unsigned depth, 379 hwloc_obj_t *f) { 380 if (o->depth == depth) { 381 if (*f == NULL) 382 *f = o; // output first descendant found 383 return 1; 384 } 385 int sum = 0; 386 for (unsigned i = 0; i < o->arity; i++) 387 sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); 388 return sum; // will be 0 if no one found (as PU arity is 0) 389 } 390 391 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, 392 hwloc_obj_type_t type, 393 hwloc_obj_t *f) { 394 if (!hwloc_compare_types(o->type, type)) { 395 if (*f == NULL) 396 *f = o; // output first descendant found 397 return 1; 398 } 399 int sum = 0; 400 for (unsigned i = 0; i < o->arity; i++) 401 sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); 402 return sum; // will be 0 if no one found (as PU arity is 0) 403 } 404 405 static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair, 406 int &nActiveThreads, 407 int &num_active_cores, 408 hwloc_obj_t obj, int depth, 409 int *labels) { 410 hwloc_obj_t core = NULL; 411 hwloc_topology_t &tp = __kmp_hwloc_topology; 412 int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core); 413 for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) { 414 hwloc_obj_t pu = NULL; 415 KMP_DEBUG_ASSERT(core != NULL); 416 int num_active_threads = 0; 417 int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu); 418 // int NT = 
core->arity; pu = core->first_child; // faster?
    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
        continue; // skip inactive (inaccessible) unit
      Address addr(depth + 2);
      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                    obj->os_index, obj->logical_index, core->os_index,
                    core->logical_index, pu->os_index, pu->logical_index));
      for (int i = 0; i < depth; ++i)
        addr.labels[i] = labels[i]; // package, etc.
      addr.labels[depth] = core_id; // core
      addr.labels[depth + 1] = pu_id; // pu
      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
      __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
      nActiveThreads++;
      ++num_active_threads; // count active threads per core
    }
    if (num_active_threads) { // were there any active threads on the core?
      ++__kmp_ncores; // count total active cores
      ++num_active_cores; // count active cores per socket
      if (num_active_threads > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
    }
  }
  return 0;
}

// Check whether a NUMA node is detected below the package, and whether a tile
// object is detected; if so, record the tile depth.
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth;

  // Get some PU
  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  if (hN != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  // check tile, get object by depth because of multiple caches possible
  depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);
  __kmp_hwloc_check_numa();

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
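    // Illustrative only (hypothetical counts): if hwloc reports 4 cores under
    // the first package and 2 PUs under the first core, and __kmp_xproc == 16,
    // the code below derives __kmp_ncores = 16 / 2 = 8 and
    // nPackages = (16 + 4 - 1) / 4 = 4.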
494 KMP_ASSERT(__kmp_affinity_type == affinity_none); 495 496 nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj( 497 hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0), HWLOC_OBJ_CORE); 498 __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj( 499 hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU); 500 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 501 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 502 if (__kmp_affinity_verbose) { 503 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 504 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 505 if (__kmp_affinity_uniform_topology()) { 506 KMP_INFORM(Uniform, "KMP_AFFINITY"); 507 } else { 508 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 509 } 510 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 511 __kmp_nThreadsPerCore, __kmp_ncores); 512 } 513 KMP_CPU_FREE(oldMask); 514 return 0; 515 } 516 517 int depth = 3; 518 int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread 519 int labels[3] = {0}; // package [,node] [,tile] - head of lables array 520 if (__kmp_numa_detected) 521 ++depth; 522 if (__kmp_tile_depth) 523 ++depth; 524 525 // Allocate the data structure to be returned. 526 AddrUnsPair *retval = 527 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 528 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 529 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 530 531 // When affinity is off, this routine will still be called to set 532 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 533 // nCoresPerPkg, & nPackages. Make sure all these vars are set 534 // correctly, and return if affinity is not enabled. 535 536 hwloc_obj_t socket, node, tile; 537 int nActiveThreads = 0; 538 int socket_id = 0; 539 // re-calculate globals to count only accessible resources 540 __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0; 541 nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0; 542 for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL; 543 socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket), 544 socket_id++) { 545 labels[0] = socket_id; 546 if (__kmp_numa_detected) { 547 int NN; 548 int n_active_nodes = 0; 549 node = NULL; 550 NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE, 551 &node); 552 for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) { 553 labels[1] = node_id; 554 if (__kmp_tile_depth) { 555 // NUMA + tiles 556 int NT; 557 int n_active_tiles = 0; 558 tile = NULL; 559 NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth, 560 &tile); 561 for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) { 562 labels[2] = tl_id; 563 int n_active_cores = 0; 564 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, 565 n_active_cores, tile, 3, labels); 566 if (n_active_cores) { // were there any active cores on the socket? 567 ++n_active_tiles; // count active tiles per node 568 if (n_active_cores > nCorePerTile) 569 nCorePerTile = n_active_cores; // calc maximum 570 } 571 } 572 if (n_active_tiles) { // were there any active tiles on the socket? 
573 ++n_active_nodes; // count active nodes per package 574 if (n_active_tiles > nTilePerNode) 575 nTilePerNode = n_active_tiles; // calc maximum 576 } 577 } else { 578 // NUMA, no tiles 579 int n_active_cores = 0; 580 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, 581 n_active_cores, node, 2, labels); 582 if (n_active_cores) { // were there any active cores on the socket? 583 ++n_active_nodes; // count active nodes per package 584 if (n_active_cores > nCorePerNode) 585 nCorePerNode = n_active_cores; // calc maximum 586 } 587 } 588 } 589 if (n_active_nodes) { // were there any active nodes on the socket? 590 ++nPackages; // count total active packages 591 if (n_active_nodes > nNodePerPkg) 592 nNodePerPkg = n_active_nodes; // calc maximum 593 } 594 } else { 595 if (__kmp_tile_depth) { 596 // no NUMA, tiles 597 int NT; 598 int n_active_tiles = 0; 599 tile = NULL; 600 NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth, 601 &tile); 602 for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) { 603 labels[1] = tl_id; 604 int n_active_cores = 0; 605 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, 606 n_active_cores, tile, 2, labels); 607 if (n_active_cores) { // were there any active cores on the socket? 608 ++n_active_tiles; // count active tiles per package 609 if (n_active_cores > nCorePerTile) 610 nCorePerTile = n_active_cores; // calc maximum 611 } 612 } 613 if (n_active_tiles) { // were there any active tiles on the socket? 614 ++nPackages; // count total active packages 615 if (n_active_tiles > nTilePerPkg) 616 nTilePerPkg = n_active_tiles; // calc maximum 617 } 618 } else { 619 // no NUMA, no tiles 620 int n_active_cores = 0; 621 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores, 622 socket, 1, labels); 623 if (n_active_cores) { // were there any active cores on the socket? 624 ++nPackages; // count total active packages 625 if (n_active_cores > nCoresPerPkg) 626 nCoresPerPkg = n_active_cores; // calc maximum 627 } 628 } 629 } 630 } 631 632 // If there's only one thread context to bind to, return now. 633 KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc); 634 KMP_ASSERT(nActiveThreads > 0); 635 if (nActiveThreads == 1) { 636 __kmp_ncores = nPackages = 1; 637 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 638 if (__kmp_affinity_verbose) { 639 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 640 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 641 642 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 643 if (__kmp_affinity_respect_mask) { 644 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 645 } else { 646 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 647 } 648 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 649 KMP_INFORM(Uniform, "KMP_AFFINITY"); 650 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 651 __kmp_nThreadsPerCore, __kmp_ncores); 652 } 653 654 if (__kmp_affinity_type == affinity_none) { 655 __kmp_free(retval); 656 KMP_CPU_FREE(oldMask); 657 return 0; 658 } 659 660 // Form an Address object which only includes the package level. 661 Address addr(1); 662 addr.labels[0] = retval[0].first.labels[0]; 663 retval[0].first = addr; 664 665 if (__kmp_affinity_gran_levels < 0) { 666 __kmp_affinity_gran_levels = 0; 667 } 668 669 if (__kmp_affinity_verbose) { 670 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 671 } 672 673 *address2os = retval; 674 KMP_CPU_FREE(oldMask); 675 return 1; 676 } 677 678 // Sort the table by physical Id. 
679 qsort(retval, nActiveThreads, sizeof(*retval), 680 __kmp_affinity_cmp_Address_labels); 681 682 // Check to see if the machine topology is uniform 683 int nPUs = nPackages * __kmp_nThreadsPerCore; 684 if (__kmp_numa_detected) { 685 if (__kmp_tile_depth) { // NUMA + tiles 686 nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile); 687 } else { // NUMA, no tiles 688 nPUs *= (nNodePerPkg * nCorePerNode); 689 } 690 } else { 691 if (__kmp_tile_depth) { // no NUMA, tiles 692 nPUs *= (nTilePerPkg * nCorePerTile); 693 } else { // no NUMA, no tiles 694 nPUs *= nCoresPerPkg; 695 } 696 } 697 unsigned uniform = (nPUs == nActiveThreads); 698 699 // Print the machine topology summary. 700 if (__kmp_affinity_verbose) { 701 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 702 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 703 if (__kmp_affinity_respect_mask) { 704 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 705 } else { 706 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 707 } 708 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 709 if (uniform) { 710 KMP_INFORM(Uniform, "KMP_AFFINITY"); 711 } else { 712 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 713 } 714 if (__kmp_numa_detected) { 715 if (__kmp_tile_depth) { // NUMA + tiles 716 KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg, 717 nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore, 718 __kmp_ncores); 719 } else { // NUMA, no tiles 720 KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg, 721 nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores); 722 nPUs *= (nNodePerPkg * nCorePerNode); 723 } 724 } else { 725 if (__kmp_tile_depth) { // no NUMA, tiles 726 KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg, 727 nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores); 728 } else { // no NUMA, no tiles 729 kmp_str_buf_t buf; 730 __kmp_str_buf_init(&buf); 731 __kmp_str_buf_print(&buf, "%d", nPackages); 732 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 733 __kmp_nThreadsPerCore, __kmp_ncores); 734 __kmp_str_buf_free(&buf); 735 } 736 } 737 } 738 739 if (__kmp_affinity_type == affinity_none) { 740 __kmp_free(retval); 741 KMP_CPU_FREE(oldMask); 742 return 0; 743 } 744 745 int depth_full = depth; // number of levels before compressing 746 // Find any levels with radiix 1, and remove them from the map 747 // (except for the package level). 748 depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, 749 levels); 750 KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default); 751 if (__kmp_affinity_gran_levels < 0) { 752 // Set the granularity level based on what levels are modeled 753 // in the machine topology map. 754 __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine) 755 if (__kmp_affinity_gran > affinity_gran_thread) { 756 for (int i = 1; i <= depth_full; ++i) { 757 if (__kmp_affinity_gran <= i) // only count deeper levels 758 break; 759 if (levels[depth_full - i] > 0) 760 __kmp_affinity_gran_levels++; 761 } 762 } 763 if (__kmp_affinity_gran > affinity_gran_package) 764 __kmp_affinity_gran_levels++; // e.g. 
granularity = group 765 } 766 767 if (__kmp_affinity_verbose) 768 __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels); 769 770 KMP_CPU_FREE(oldMask); 771 *address2os = retval; 772 return depth; 773 } 774 #endif // KMP_USE_HWLOC 775 776 // If we don't know how to retrieve the machine's processor topology, or 777 // encounter an error in doing so, this routine is called to form a "flat" 778 // mapping of os thread id's <-> processor id's. 779 static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os, 780 kmp_i18n_id_t *const msg_id) { 781 *address2os = NULL; 782 *msg_id = kmp_i18n_null; 783 784 // Even if __kmp_affinity_type == affinity_none, this routine might still 785 // called to set __kmp_ncores, as well as 786 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 787 if (!KMP_AFFINITY_CAPABLE()) { 788 KMP_ASSERT(__kmp_affinity_type == affinity_none); 789 __kmp_ncores = nPackages = __kmp_xproc; 790 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 791 if (__kmp_affinity_verbose) { 792 KMP_INFORM(AffFlatTopology, "KMP_AFFINITY"); 793 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 794 KMP_INFORM(Uniform, "KMP_AFFINITY"); 795 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 796 __kmp_nThreadsPerCore, __kmp_ncores); 797 } 798 return 0; 799 } 800 801 // When affinity is off, this routine will still be called to set 802 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 803 // Make sure all these vars are set correctly, and return now if affinity is 804 // not enabled. 805 __kmp_ncores = nPackages = __kmp_avail_proc; 806 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 807 if (__kmp_affinity_verbose) { 808 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 809 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 810 __kmp_affin_fullMask); 811 812 KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY"); 813 if (__kmp_affinity_respect_mask) { 814 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 815 } else { 816 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 817 } 818 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 819 KMP_INFORM(Uniform, "KMP_AFFINITY"); 820 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 821 __kmp_nThreadsPerCore, __kmp_ncores); 822 } 823 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 824 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 825 if (__kmp_affinity_type == affinity_none) { 826 int avail_ct = 0; 827 int i; 828 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 829 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) 830 continue; 831 __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat 832 } 833 return 0; 834 } 835 836 // Contruct the data structure to be returned. 837 *address2os = 838 (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 839 int avail_ct = 0; 840 unsigned int i; 841 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 842 // Skip this proc if it is not included in the machine model. 843 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 844 continue; 845 } 846 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat 847 Address addr(1); 848 addr.labels[0] = i; 849 (*address2os)[avail_ct++] = AddrUnsPair(addr, i); 850 } 851 if (__kmp_affinity_verbose) { 852 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); 853 } 854 855 if (__kmp_affinity_gran_levels < 0) { 856 // Only the package level is modeled in the machine topology map, 857 // so the #levels of granularity is either 0 or 1. 
858 if (__kmp_affinity_gran > affinity_gran_package) { 859 __kmp_affinity_gran_levels = 1; 860 } else { 861 __kmp_affinity_gran_levels = 0; 862 } 863 } 864 return 1; 865 } 866 867 #if KMP_GROUP_AFFINITY 868 869 // If multiple Windows* OS processor groups exist, we can create a 2-level 870 // topology map with the groups at level 0 and the individual procs at level 1. 871 // This facilitates letting the threads float among all procs in a group, 872 // if granularity=group (the default when there are multiple groups). 873 static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, 874 kmp_i18n_id_t *const msg_id) { 875 *address2os = NULL; 876 *msg_id = kmp_i18n_null; 877 878 // If we aren't affinity capable, then return now. 879 // The flat mapping will be used. 880 if (!KMP_AFFINITY_CAPABLE()) { 881 // FIXME set *msg_id 882 return -1; 883 } 884 885 // Contruct the data structure to be returned. 886 *address2os = 887 (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 888 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 889 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 890 int avail_ct = 0; 891 int i; 892 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 893 // Skip this proc if it is not included in the machine model. 894 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 895 continue; 896 } 897 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat 898 Address addr(2); 899 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); 900 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); 901 (*address2os)[avail_ct++] = AddrUnsPair(addr, i); 902 903 if (__kmp_affinity_verbose) { 904 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], 905 addr.labels[1]); 906 } 907 } 908 909 if (__kmp_affinity_gran_levels < 0) { 910 if (__kmp_affinity_gran == affinity_gran_group) { 911 __kmp_affinity_gran_levels = 1; 912 } else if ((__kmp_affinity_gran == affinity_gran_fine) || 913 (__kmp_affinity_gran == affinity_gran_thread)) { 914 __kmp_affinity_gran_levels = 0; 915 } else { 916 const char *gran_str = NULL; 917 if (__kmp_affinity_gran == affinity_gran_core) { 918 gran_str = "core"; 919 } else if (__kmp_affinity_gran == affinity_gran_package) { 920 gran_str = "package"; 921 } else if (__kmp_affinity_gran == affinity_gran_node) { 922 gran_str = "node"; 923 } else { 924 KMP_ASSERT(0); 925 } 926 927 // Warning: can't use affinity granularity \"gran\" with group topology 928 // method, using "thread" 929 __kmp_affinity_gran_levels = 0; 930 } 931 } 932 return 2; 933 } 934 935 #endif /* KMP_GROUP_AFFINITY */ 936 937 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 938 939 static int __kmp_cpuid_mask_width(int count) { 940 int r = 0; 941 942 while ((1 << r) < count) 943 ++r; 944 return r; 945 } 946 947 class apicThreadInfo { 948 public: 949 unsigned osId; // param to __kmp_affinity_bind_thread 950 unsigned apicId; // from cpuid after binding 951 unsigned maxCoresPerPkg; // "" 952 unsigned maxThreadsPerPkg; // "" 953 unsigned pkgId; // inferred from above values 954 unsigned coreId; // "" 955 unsigned threadId; // "" 956 }; 957 958 static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, 959 const void *b) { 960 const apicThreadInfo *aa = (const apicThreadInfo *)a; 961 const apicThreadInfo *bb = (const apicThreadInfo *)b; 962 if (aa->osId < bb->osId) 963 return -1; 964 if (aa->osId > bb->osId) 965 return 1; 966 return 0; 967 } 968 969 static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, 970 const void *b) { 971 const apicThreadInfo *aa = (const 
apicThreadInfo *)a; 972 const apicThreadInfo *bb = (const apicThreadInfo *)b; 973 if (aa->pkgId < bb->pkgId) 974 return -1; 975 if (aa->pkgId > bb->pkgId) 976 return 1; 977 if (aa->coreId < bb->coreId) 978 return -1; 979 if (aa->coreId > bb->coreId) 980 return 1; 981 if (aa->threadId < bb->threadId) 982 return -1; 983 if (aa->threadId > bb->threadId) 984 return 1; 985 return 0; 986 } 987 988 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 989 // an algorithm which cycles through the available os threads, setting 990 // the current thread's affinity mask to that thread, and then retrieves 991 // the Apic Id for each thread context using the cpuid instruction. 992 static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 993 kmp_i18n_id_t *const msg_id) { 994 kmp_cpuid buf; 995 int rc; 996 *address2os = NULL; 997 *msg_id = kmp_i18n_null; 998 999 // Check if cpuid leaf 4 is supported. 1000 __kmp_x86_cpuid(0, 0, &buf); 1001 if (buf.eax < 4) { 1002 *msg_id = kmp_i18n_str_NoLeaf4Support; 1003 return -1; 1004 } 1005 1006 // The algorithm used starts by setting the affinity to each available thread 1007 // and retrieving info from the cpuid instruction, so if we are not capable of 1008 // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we 1009 // need to do something else - use the defaults that we calculated from 1010 // issuing cpuid without binding to each proc. 1011 if (!KMP_AFFINITY_CAPABLE()) { 1012 // Hack to try and infer the machine topology using only the data 1013 // available from cpuid on the current thread, and __kmp_xproc. 1014 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1015 1016 // Get an upper bound on the number of threads per package using cpuid(1). 1017 // On some OS/chps combinations where HT is supported by the chip but is 1018 // disabled, this value will be 2 on a single core chip. Usually, it will be 1019 // 2 if HT is enabled and 1 if HT is disabled. 1020 __kmp_x86_cpuid(1, 0, &buf); 1021 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 1022 if (maxThreadsPerPkg == 0) { 1023 maxThreadsPerPkg = 1; 1024 } 1025 1026 // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded 1027 // value. 1028 // 1029 // The author of cpu_count.cpp treated this only an upper bound on the 1030 // number of cores, but I haven't seen any cases where it was greater than 1031 // the actual number of cores, so we will treat it as exact in this block of 1032 // code. 1033 // 1034 // First, we need to check if cpuid(4) is supported on this chip. To see if 1035 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or 1036 // greater. 1037 __kmp_x86_cpuid(0, 0, &buf); 1038 if (buf.eax >= 4) { 1039 __kmp_x86_cpuid(4, 0, &buf); 1040 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 1041 } else { 1042 nCoresPerPkg = 1; 1043 } 1044 1045 // There is no way to reliably tell if HT is enabled without issuing the 1046 // cpuid instruction from every thread, can correlating the cpuid info, so 1047 // if the machine is not affinity capable, we assume that HT is off. We have 1048 // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine 1049 // does not support HT. 1050 // 1051 // - Older OSes are usually found on machines with older chips, which do not 1052 // support HT. 
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as being
    //   1 thread/core when it is really HT enabled (which results in blocktime
    //   being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
1116 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 1117 continue; 1118 } 1119 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 1120 1121 __kmp_affinity_dispatch->bind_thread(i); 1122 threadInfo[nApics].osId = i; 1123 1124 // The apic id and max threads per pkg come from cpuid(1). 1125 __kmp_x86_cpuid(1, 0, &buf); 1126 if (((buf.edx >> 9) & 1) == 0) { 1127 __kmp_set_system_affinity(oldMask, TRUE); 1128 __kmp_free(threadInfo); 1129 KMP_CPU_FREE(oldMask); 1130 *msg_id = kmp_i18n_str_ApicNotPresent; 1131 return -1; 1132 } 1133 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 1134 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 1135 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 1136 threadInfo[nApics].maxThreadsPerPkg = 1; 1137 } 1138 1139 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded 1140 // value. 1141 // 1142 // First, we need to check if cpuid(4) is supported on this chip. To see if 1143 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n 1144 // or greater. 1145 __kmp_x86_cpuid(0, 0, &buf); 1146 if (buf.eax >= 4) { 1147 __kmp_x86_cpuid(4, 0, &buf); 1148 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 1149 } else { 1150 threadInfo[nApics].maxCoresPerPkg = 1; 1151 } 1152 1153 // Infer the pkgId / coreId / threadId using only the info obtained locally. 1154 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg); 1155 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 1156 1157 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg); 1158 int widthT = widthCT - widthC; 1159 if (widthT < 0) { 1160 // I've never seen this one happen, but I suppose it could, if the cpuid 1161 // instruction on a chip was really screwed up. Make sure to restore the 1162 // affinity mask before the tail call. 1163 __kmp_set_system_affinity(oldMask, TRUE); 1164 __kmp_free(threadInfo); 1165 KMP_CPU_FREE(oldMask); 1166 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1167 return -1; 1168 } 1169 1170 int maskC = (1 << widthC) - 1; 1171 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC; 1172 1173 int maskT = (1 << widthT) - 1; 1174 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT; 1175 1176 nApics++; 1177 } 1178 1179 // We've collected all the info we need. 1180 // Restore the old affinity mask for this thread. 1181 __kmp_set_system_affinity(oldMask, TRUE); 1182 1183 // If there's only one thread context to bind to, form an Address object 1184 // with depth 1 and return immediately (or, if affinity is off, set 1185 // address2os to NULL and return). 1186 // 1187 // If it is configured to omit the package level when there is only a single 1188 // package, the logic at the end of this routine won't work if there is only 1189 // a single thread - it would try to form an Address object with depth 0. 
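  // Worked example (hypothetical values) of the decomposition performed in the
  // loop above: with maxThreadsPerPkg == 8 (widthCT == 3) and
  // maxCoresPerPkg == 4 (widthC == 2, hence widthT == 1), an Apic Id of
  // 0x1d == 0b11101 splits into pkgId = 0b11101 >> 3 = 3,
  // coreId = (0b11101 >> 1) & 0b11 = 2, and threadId = 0b11101 & 0b1 = 1.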
1190 KMP_ASSERT(nApics > 0); 1191 if (nApics == 1) { 1192 __kmp_ncores = nPackages = 1; 1193 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1194 if (__kmp_affinity_verbose) { 1195 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1196 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1197 1198 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1199 if (__kmp_affinity_respect_mask) { 1200 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1201 } else { 1202 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1203 } 1204 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1205 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1206 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1207 __kmp_nThreadsPerCore, __kmp_ncores); 1208 } 1209 1210 if (__kmp_affinity_type == affinity_none) { 1211 __kmp_free(threadInfo); 1212 KMP_CPU_FREE(oldMask); 1213 return 0; 1214 } 1215 1216 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 1217 Address addr(1); 1218 addr.labels[0] = threadInfo[0].pkgId; 1219 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1220 1221 if (__kmp_affinity_gran_levels < 0) { 1222 __kmp_affinity_gran_levels = 0; 1223 } 1224 1225 if (__kmp_affinity_verbose) { 1226 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1227 } 1228 1229 __kmp_free(threadInfo); 1230 KMP_CPU_FREE(oldMask); 1231 return 1; 1232 } 1233 1234 // Sort the threadInfo table by physical Id. 1235 qsort(threadInfo, nApics, sizeof(*threadInfo), 1236 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1237 1238 // The table is now sorted by pkgId / coreId / threadId, but we really don't 1239 // know the radix of any of the fields. pkgId's may be sparsely assigned among 1240 // the chips on a system. Although coreId's are usually assigned 1241 // [0 .. coresPerPkg-1] and threadId's are usually assigned 1242 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1243 // 1244 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 1245 // total # packages) are at this point - we want to determine that now. We 1246 // only have an upper bound on the first two figures. 1247 // 1248 // We also perform a consistency check at this point: the values returned by 1249 // the cpuid instruction for any thread bound to a given package had better 1250 // return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1251 nPackages = 1; 1252 nCoresPerPkg = 1; 1253 __kmp_nThreadsPerCore = 1; 1254 unsigned nCores = 1; 1255 1256 unsigned pkgCt = 1; // to determine radii 1257 unsigned lastPkgId = threadInfo[0].pkgId; 1258 unsigned coreCt = 1; 1259 unsigned lastCoreId = threadInfo[0].coreId; 1260 unsigned threadCt = 1; 1261 unsigned lastThreadId = threadInfo[0].threadId; 1262 1263 // intra-pkg consist checks 1264 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1265 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1266 1267 for (i = 1; i < nApics; i++) { 1268 if (threadInfo[i].pkgId != lastPkgId) { 1269 nCores++; 1270 pkgCt++; 1271 lastPkgId = threadInfo[i].pkgId; 1272 if ((int)coreCt > nCoresPerPkg) 1273 nCoresPerPkg = coreCt; 1274 coreCt = 1; 1275 lastCoreId = threadInfo[i].coreId; 1276 if ((int)threadCt > __kmp_nThreadsPerCore) 1277 __kmp_nThreadsPerCore = threadCt; 1278 threadCt = 1; 1279 lastThreadId = threadInfo[i].threadId; 1280 1281 // This is a different package, so go on to the next iteration without 1282 // doing any consistency checks. Reset the consistency check vars, though. 
1283 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1284 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1285 continue; 1286 } 1287 1288 if (threadInfo[i].coreId != lastCoreId) { 1289 nCores++; 1290 coreCt++; 1291 lastCoreId = threadInfo[i].coreId; 1292 if ((int)threadCt > __kmp_nThreadsPerCore) 1293 __kmp_nThreadsPerCore = threadCt; 1294 threadCt = 1; 1295 lastThreadId = threadInfo[i].threadId; 1296 } else if (threadInfo[i].threadId != lastThreadId) { 1297 threadCt++; 1298 lastThreadId = threadInfo[i].threadId; 1299 } else { 1300 __kmp_free(threadInfo); 1301 KMP_CPU_FREE(oldMask); 1302 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1303 return -1; 1304 } 1305 1306 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1307 // fields agree between all the threads bounds to a given package. 1308 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || 1309 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1310 __kmp_free(threadInfo); 1311 KMP_CPU_FREE(oldMask); 1312 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1313 return -1; 1314 } 1315 } 1316 nPackages = pkgCt; 1317 if ((int)coreCt > nCoresPerPkg) 1318 nCoresPerPkg = coreCt; 1319 if ((int)threadCt > __kmp_nThreadsPerCore) 1320 __kmp_nThreadsPerCore = threadCt; 1321 1322 // When affinity is off, this routine will still be called to set 1323 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1324 // Make sure all these vars are set correctly, and return now if affinity is 1325 // not enabled. 1326 __kmp_ncores = nCores; 1327 if (__kmp_affinity_verbose) { 1328 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1329 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1330 1331 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1332 if (__kmp_affinity_respect_mask) { 1333 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1334 } else { 1335 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1336 } 1337 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1338 if (__kmp_affinity_uniform_topology()) { 1339 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1340 } else { 1341 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1342 } 1343 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1344 __kmp_nThreadsPerCore, __kmp_ncores); 1345 } 1346 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1347 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1348 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1349 for (i = 0; i < nApics; ++i) { 1350 __kmp_pu_os_idx[i] = threadInfo[i].osId; 1351 } 1352 if (__kmp_affinity_type == affinity_none) { 1353 __kmp_free(threadInfo); 1354 KMP_CPU_FREE(oldMask); 1355 return 0; 1356 } 1357 1358 // Now that we've determined the number of packages, the number of cores per 1359 // package, and the number of threads per core, we can construct the data 1360 // structure that is to be returned. 1361 int pkgLevel = 0; 1362 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1363 int threadLevel = 1364 (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1365 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1366 1367 KMP_ASSERT(depth > 0); 1368 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1369 1370 for (i = 0; i < nApics; ++i) { 1371 Address addr(depth); 1372 unsigned os = threadInfo[i].osId; 1373 int d = 0; 1374 1375 if (pkgLevel >= 0) { 1376 addr.labels[d++] = threadInfo[i].pkgId; 1377 } 1378 if (coreLevel >= 0) { 1379 addr.labels[d++] = threadInfo[i].coreId; 1380 } 1381 if (threadLevel >= 0) { 1382 addr.labels[d++] = threadInfo[i].threadId; 1383 } 1384 (*address2os)[i] = AddrUnsPair(addr, os); 1385 } 1386 1387 if (__kmp_affinity_gran_levels < 0) { 1388 // Set the granularity level based on what levels are modeled in the machine 1389 // topology map. 1390 __kmp_affinity_gran_levels = 0; 1391 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1392 __kmp_affinity_gran_levels++; 1393 } 1394 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1395 __kmp_affinity_gran_levels++; 1396 } 1397 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1398 __kmp_affinity_gran_levels++; 1399 } 1400 } 1401 1402 if (__kmp_affinity_verbose) { 1403 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1404 coreLevel, threadLevel); 1405 } 1406 1407 __kmp_free(threadInfo); 1408 KMP_CPU_FREE(oldMask); 1409 return depth; 1410 } 1411 1412 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1413 // architectures support a newer interface for specifying the x2APIC Ids, 1414 // based on cpuid leaf 11. 1415 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1416 kmp_i18n_id_t *const msg_id) { 1417 kmp_cpuid buf; 1418 *address2os = NULL; 1419 *msg_id = kmp_i18n_null; 1420 1421 // Check to see if cpuid leaf 11 is supported. 1422 __kmp_x86_cpuid(0, 0, &buf); 1423 if (buf.eax < 11) { 1424 *msg_id = kmp_i18n_str_NoLeaf11Support; 1425 return -1; 1426 } 1427 __kmp_x86_cpuid(11, 0, &buf); 1428 if (buf.ebx == 0) { 1429 *msg_id = kmp_i18n_str_NoLeaf11Support; 1430 return -1; 1431 } 1432 1433 // Find the number of levels in the machine topology. While we're at it, get 1434 // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to 1435 // get more accurate values later by explicitly counting them, but get 1436 // reasonable defaults now, in case we return early. 1437 int level; 1438 int threadLevel = -1; 1439 int coreLevel = -1; 1440 int pkgLevel = -1; 1441 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1442 1443 for (level = 0;; level++) { 1444 if (level > 31) { 1445 // FIXME: Hack for DPD200163180 1446 // 1447 // If level is big then something went wrong -> exiting 1448 // 1449 // There could actually be 32 valid levels in the machine topology, but so 1450 // far, the only machine we have seen which does not exit this loop before 1451 // iteration 32 has fubar x2APIC settings. 1452 // 1453 // For now, just reject this case based upon loop trip count. 
1454 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1455 return -1; 1456 } 1457 __kmp_x86_cpuid(11, level, &buf); 1458 if (buf.ebx == 0) { 1459 if (pkgLevel < 0) { 1460 // Will infer nPackages from __kmp_xproc 1461 pkgLevel = level; 1462 level++; 1463 } 1464 break; 1465 } 1466 int kind = (buf.ecx >> 8) & 0xff; 1467 if (kind == 1) { 1468 // SMT level 1469 threadLevel = level; 1470 coreLevel = -1; 1471 pkgLevel = -1; 1472 __kmp_nThreadsPerCore = buf.ebx & 0xffff; 1473 if (__kmp_nThreadsPerCore == 0) { 1474 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1475 return -1; 1476 } 1477 } else if (kind == 2) { 1478 // core level 1479 coreLevel = level; 1480 pkgLevel = -1; 1481 nCoresPerPkg = buf.ebx & 0xffff; 1482 if (nCoresPerPkg == 0) { 1483 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1484 return -1; 1485 } 1486 } else { 1487 if (level <= 0) { 1488 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1489 return -1; 1490 } 1491 if (pkgLevel >= 0) { 1492 continue; 1493 } 1494 pkgLevel = level; 1495 nPackages = buf.ebx & 0xffff; 1496 if (nPackages == 0) { 1497 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1498 return -1; 1499 } 1500 } 1501 } 1502 int depth = level; 1503 1504 // In the above loop, "level" was counted from the finest level (usually 1505 // thread) to the coarsest. The caller expects that we will place the labels 1506 // in (*address2os)[].first.labels[] in the inverse order, so we need to 1507 // invert the vars saying which level means what. 1508 if (threadLevel >= 0) { 1509 threadLevel = depth - threadLevel - 1; 1510 } 1511 if (coreLevel >= 0) { 1512 coreLevel = depth - coreLevel - 1; 1513 } 1514 KMP_DEBUG_ASSERT(pkgLevel >= 0); 1515 pkgLevel = depth - pkgLevel - 1; 1516 1517 // The algorithm used starts by setting the affinity to each available thread 1518 // and retrieving info from the cpuid instruction, so if we are not capable of 1519 // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we 1520 // need to do something else - use the defaults that we calculated from 1521 // issuing cpuid without binding to each proc. 1522 if (!KMP_AFFINITY_CAPABLE()) { 1523 // Hack to try and infer the machine topology using only the data 1524 // available from cpuid on the current thread, and __kmp_xproc. 1525 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1526 1527 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1528 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1529 if (__kmp_affinity_verbose) { 1530 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1531 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1532 if (__kmp_affinity_uniform_topology()) { 1533 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1534 } else { 1535 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1536 } 1537 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1538 __kmp_nThreadsPerCore, __kmp_ncores); 1539 } 1540 return 0; 1541 } 1542 1543 // From here on, we can assume that it is safe to call 1544 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 1545 // __kmp_affinity_type = affinity_none. 1546 1547 // Save the affinity mask for the current thread. 1548 kmp_affin_mask_t *oldMask; 1549 KMP_CPU_ALLOC(oldMask); 1550 __kmp_get_system_affinity(oldMask, TRUE); 1551 1552 // Allocate the data structure to be returned. 1553 AddrUnsPair *retval = 1554 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1555 1556 // Run through each of the available contexts, binding the current thread 1557 // to it, and obtaining the pertinent information using the cpuid instr. 
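  // Illustrative only (hypothetical shift values) for the loop below: if
  // cpuid(11) reports a shift of 1 at the SMT level and 5 at the core level,
  // an x2APIC id of 0x2b decomposes as thread = 0x2b & 0x1 = 1,
  // core = (0x2b & 0x1f) >> 1 = 5, and package = 0x2b >> 5 = 1.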
1558 unsigned int proc; 1559 int nApics = 0; 1560 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1561 // Skip this proc if it is not included in the machine model. 1562 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1563 continue; 1564 } 1565 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1566 1567 __kmp_affinity_dispatch->bind_thread(proc); 1568 1569 // Extract labels for each level in the machine topology map from Apic ID. 1570 Address addr(depth); 1571 int prev_shift = 0; 1572 1573 for (level = 0; level < depth; level++) { 1574 __kmp_x86_cpuid(11, level, &buf); 1575 unsigned apicId = buf.edx; 1576 if (buf.ebx == 0) { 1577 if (level != depth - 1) { 1578 KMP_CPU_FREE(oldMask); 1579 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1580 return -1; 1581 } 1582 addr.labels[depth - level - 1] = apicId >> prev_shift; 1583 level++; 1584 break; 1585 } 1586 int shift = buf.eax & 0x1f; 1587 int mask = (1 << shift) - 1; 1588 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1589 prev_shift = shift; 1590 } 1591 if (level != depth) { 1592 KMP_CPU_FREE(oldMask); 1593 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1594 return -1; 1595 } 1596 1597 retval[nApics] = AddrUnsPair(addr, proc); 1598 nApics++; 1599 } 1600 1601 // We've collected all the info we need. 1602 // Restore the old affinity mask for this thread. 1603 __kmp_set_system_affinity(oldMask, TRUE); 1604 1605 // If there's only one thread context to bind to, return now. 1606 KMP_ASSERT(nApics > 0); 1607 if (nApics == 1) { 1608 __kmp_ncores = nPackages = 1; 1609 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1610 if (__kmp_affinity_verbose) { 1611 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1612 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1613 1614 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1615 if (__kmp_affinity_respect_mask) { 1616 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1617 } else { 1618 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1619 } 1620 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1621 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1622 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1623 __kmp_nThreadsPerCore, __kmp_ncores); 1624 } 1625 1626 if (__kmp_affinity_type == affinity_none) { 1627 __kmp_free(retval); 1628 KMP_CPU_FREE(oldMask); 1629 return 0; 1630 } 1631 1632 // Form an Address object which only includes the package level. 1633 Address addr(1); 1634 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1635 retval[0].first = addr; 1636 1637 if (__kmp_affinity_gran_levels < 0) { 1638 __kmp_affinity_gran_levels = 0; 1639 } 1640 1641 if (__kmp_affinity_verbose) { 1642 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1643 } 1644 1645 *address2os = retval; 1646 KMP_CPU_FREE(oldMask); 1647 return 1; 1648 } 1649 1650 // Sort the table by physical Id. 1651 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1652 1653 // Find the radix at each of the levels. 
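// Annotation (hypothetical numbers): for a sorted table describing 2 packages
// x 2 cores x 2 threads, the counting pass below ends with
//   totals = {2, 4, 8}   // distinct objects seen at the pkg/core/thread level
//   maxCt  = {2, 2, 2}   // max children observed under any single parent
// from which nPackages = totals[pkgLevel], nCoresPerPkg = maxCt[coreLevel],
// __kmp_ncores = totals[coreLevel] and __kmp_nThreadsPerCore =
// maxCt[threadLevel] are read off directly.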
1654 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1655 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1656 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1657 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1658 for (level = 0; level < depth; level++) { 1659 totals[level] = 1; 1660 maxCt[level] = 1; 1661 counts[level] = 1; 1662 last[level] = retval[0].first.labels[level]; 1663 } 1664 1665 // From here on, the iteration variable "level" runs from the finest level to 1666 // the coarsest, i.e. we iterate forward through 1667 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1668 // backwards. 1669 for (proc = 1; (int)proc < nApics; proc++) { 1670 int level; 1671 for (level = 0; level < depth; level++) { 1672 if (retval[proc].first.labels[level] != last[level]) { 1673 int j; 1674 for (j = level + 1; j < depth; j++) { 1675 totals[j]++; 1676 counts[j] = 1; 1677 // The line below causes printing incorrect topology information in 1678 // case the max value for some level (maxCt[level]) is encountered 1679 // earlier than some less value while going through the array. For 1680 // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then 1681 // maxCt[1] == 2 1682 // whereas it must be 4. 1683 // TODO!!! Check if it can be commented safely 1684 // maxCt[j] = 1; 1685 last[j] = retval[proc].first.labels[j]; 1686 } 1687 totals[level]++; 1688 counts[level]++; 1689 if (counts[level] > maxCt[level]) { 1690 maxCt[level] = counts[level]; 1691 } 1692 last[level] = retval[proc].first.labels[level]; 1693 break; 1694 } else if (level == depth - 1) { 1695 __kmp_free(last); 1696 __kmp_free(maxCt); 1697 __kmp_free(counts); 1698 __kmp_free(totals); 1699 __kmp_free(retval); 1700 KMP_CPU_FREE(oldMask); 1701 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1702 return -1; 1703 } 1704 } 1705 } 1706 1707 // When affinity is off, this routine will still be called to set 1708 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1709 // Make sure all these vars are set correctly, and return if affinity is not 1710 // enabled. 1711 if (threadLevel >= 0) { 1712 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1713 } else { 1714 __kmp_nThreadsPerCore = 1; 1715 } 1716 nPackages = totals[pkgLevel]; 1717 1718 if (coreLevel >= 0) { 1719 __kmp_ncores = totals[coreLevel]; 1720 nCoresPerPkg = maxCt[coreLevel]; 1721 } else { 1722 __kmp_ncores = nPackages; 1723 nCoresPerPkg = 1; 1724 } 1725 1726 // Check to see if the machine topology is uniform 1727 unsigned prod = maxCt[0]; 1728 for (level = 1; level < depth; level++) { 1729 prod *= maxCt[level]; 1730 } 1731 bool uniform = (prod == totals[level - 1]); 1732 1733 // Print the machine topology summary. 
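// Annotation (hypothetical numbers) for the uniformity test just above: with
// maxCt = {2, 4, 2}, prod = 2 * 4 * 2 = 16; if only 12 hardware threads were
// actually counted (say one package has 4 cores and the other 2), then
// prod != totals[depth - 1] and the summary below reports NonUniform.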
1734 if (__kmp_affinity_verbose) { 1735 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1736 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1737 1738 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1739 if (__kmp_affinity_respect_mask) { 1740 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1741 } else { 1742 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1743 } 1744 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1745 if (uniform) { 1746 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1747 } else { 1748 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1749 } 1750 1751 kmp_str_buf_t buf; 1752 __kmp_str_buf_init(&buf); 1753 1754 __kmp_str_buf_print(&buf, "%d", totals[0]); 1755 for (level = 1; level <= pkgLevel; level++) { 1756 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1757 } 1758 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1759 __kmp_nThreadsPerCore, __kmp_ncores); 1760 1761 __kmp_str_buf_free(&buf); 1762 } 1763 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1764 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1765 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1766 for (proc = 0; (int)proc < nApics; ++proc) { 1767 __kmp_pu_os_idx[proc] = retval[proc].second; 1768 } 1769 if (__kmp_affinity_type == affinity_none) { 1770 __kmp_free(last); 1771 __kmp_free(maxCt); 1772 __kmp_free(counts); 1773 __kmp_free(totals); 1774 __kmp_free(retval); 1775 KMP_CPU_FREE(oldMask); 1776 return 0; 1777 } 1778 1779 // Find any levels with radiix 1, and remove them from the map 1780 // (except for the package level). 1781 int new_depth = 0; 1782 for (level = 0; level < depth; level++) { 1783 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1784 continue; 1785 } 1786 new_depth++; 1787 } 1788 1789 // If we are removing any levels, allocate a new vector to return, 1790 // and copy the relevant information to it. 1791 if (new_depth != depth) { 1792 AddrUnsPair *new_retval = 1793 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1794 for (proc = 0; (int)proc < nApics; proc++) { 1795 Address addr(new_depth); 1796 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1797 } 1798 int new_level = 0; 1799 int newPkgLevel = -1; 1800 int newCoreLevel = -1; 1801 int newThreadLevel = -1; 1802 int i; 1803 for (level = 0; level < depth; level++) { 1804 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1805 // Remove this level. Never remove the package level 1806 continue; 1807 } 1808 if (level == pkgLevel) { 1809 newPkgLevel = new_level; 1810 } 1811 if (level == coreLevel) { 1812 newCoreLevel = new_level; 1813 } 1814 if (level == threadLevel) { 1815 newThreadLevel = new_level; 1816 } 1817 for (proc = 0; (int)proc < nApics; proc++) { 1818 new_retval[proc].first.labels[new_level] = 1819 retval[proc].first.labels[level]; 1820 } 1821 new_level++; 1822 } 1823 1824 __kmp_free(retval); 1825 retval = new_retval; 1826 depth = new_depth; 1827 pkgLevel = newPkgLevel; 1828 coreLevel = newCoreLevel; 1829 threadLevel = newThreadLevel; 1830 } 1831 1832 if (__kmp_affinity_gran_levels < 0) { 1833 // Set the granularity level based on what levels are modeled 1834 // in the machine topology map. 
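// Annotation: __kmp_affinity_gran_levels counts how many of the finest levels
// get folded into a single affinity mask. For example (assuming a 3-level
// package/core/thread map), granularity=core leaves gran_levels == 1, so the
// hyperthreads of a core later share one mask; granularity=thread (fine)
// leaves it at 0.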
1835 __kmp_affinity_gran_levels = 0; 1836 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1837 __kmp_affinity_gran_levels++; 1838 } 1839 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1840 __kmp_affinity_gran_levels++; 1841 } 1842 if (__kmp_affinity_gran > affinity_gran_package) { 1843 __kmp_affinity_gran_levels++; 1844 } 1845 } 1846 1847 if (__kmp_affinity_verbose) { 1848 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, 1849 threadLevel); 1850 } 1851 1852 __kmp_free(last); 1853 __kmp_free(maxCt); 1854 __kmp_free(counts); 1855 __kmp_free(totals); 1856 KMP_CPU_FREE(oldMask); 1857 *address2os = retval; 1858 return depth; 1859 } 1860 1861 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1862 1863 #define osIdIndex 0 1864 #define threadIdIndex 1 1865 #define coreIdIndex 2 1866 #define pkgIdIndex 3 1867 #define nodeIdIndex 4 1868 1869 typedef unsigned *ProcCpuInfo; 1870 static unsigned maxIndex = pkgIdIndex; 1871 1872 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) { 1873 const unsigned *aa = (const unsigned *)a; 1874 const unsigned *bb = (const unsigned *)b; 1875 if (aa[osIdIndex] < bb[osIdIndex]) 1876 return -1; 1877 if (aa[osIdIndex] > bb[osIdIndex]) 1878 return 1; 1879 return 0; 1880 } 1881 1882 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 1883 const void *b) { 1884 unsigned i; 1885 const unsigned *aa = *(unsigned *const *)a; 1886 const unsigned *bb = *(unsigned *const *)b; 1887 for (i = maxIndex;; i--) { 1888 if (aa[i] < bb[i]) 1889 return -1; 1890 if (aa[i] > bb[i]) 1891 return 1; 1892 if (i == osIdIndex) 1893 break; 1894 } 1895 return 0; 1896 } 1897 1898 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1899 // affinity map. 1900 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 1901 int *line, 1902 kmp_i18n_id_t *const msg_id, 1903 FILE *f) { 1904 *address2os = NULL; 1905 *msg_id = kmp_i18n_null; 1906 1907 // Scan the file once, counting the number of "processor" (osId) fields, 1908 // and finding the highest value of <n> for a node_<n> field. 1909 char buf[256]; 1910 unsigned num_records = 0; 1911 while (!feof(f)) { 1912 buf[sizeof(buf) - 1] = 1; 1913 if (!fgets(buf, sizeof(buf), f)) { 1914 // Read error, presumably because we hit EOF 1915 break; 1916 } 1917 1918 char s1[] = "processor"; 1919 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1920 num_records++; 1921 continue; 1922 } 1923 1924 // FIXME - this will match "node_<n> <garbage>" 1925 unsigned level; 1926 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 1927 if (nodeIdIndex + level >= maxIndex) { 1928 maxIndex = nodeIdIndex + level; 1929 } 1930 continue; 1931 } 1932 } 1933 1934 // Check for empty file / no valid processor records, or too many. The number 1935 // of records can't exceed the number of valid bits in the affinity mask. 1936 if (num_records == 0) { 1937 *line = 0; 1938 *msg_id = kmp_i18n_str_NoProcRecords; 1939 return -1; 1940 } 1941 if (num_records > (unsigned)__kmp_xproc) { 1942 *line = 0; 1943 *msg_id = kmp_i18n_str_TooManyProcRecords; 1944 return -1; 1945 } 1946 1947 // Set the file pointer back to the beginning, so that we can scan the file 1948 // again, this time performing a full parse of the data. Allocate a vector of 1949 // ProcCpuInfo objects, where we will place the data. Adding an extra element 1950 // at the end allows us to remove a lot of extra checks for termination 1951 // conditions.
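// Annotation: the second pass below recognizes records of the following shape
// (values are illustrative only; "thread id" and "node_<n> id" are optional
// extensions of the plain /proc/cpuinfo format):
//   processor   : 0
//   physical id : 0
//   core id     : 1
//   thread id   : 0
//   node_0 id   : 0
// with a blank line terminating each processor record.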
1952 if (fseek(f, 0, SEEK_SET) != 0) { 1953 *line = 0; 1954 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1955 return -1; 1956 } 1957 1958 // Allocate the array of records to store the proc info in. The dummy 1959 // element at the end makes the logic in filling them out easier to code. 1960 unsigned **threadInfo = 1961 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 1962 unsigned i; 1963 for (i = 0; i <= num_records; i++) { 1964 threadInfo[i] = 1965 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 1966 } 1967 1968 #define CLEANUP_THREAD_INFO \ 1969 for (i = 0; i <= num_records; i++) { \ 1970 __kmp_free(threadInfo[i]); \ 1971 } \ 1972 __kmp_free(threadInfo); 1973 1974 // A value of UINT_MAX means that we didn't find the field 1975 unsigned __index; 1976 1977 #define INIT_PROC_INFO(p) \ 1978 for (__index = 0; __index <= maxIndex; __index++) { \ 1979 (p)[__index] = UINT_MAX; \ 1980 } 1981 1982 for (i = 0; i <= num_records; i++) { 1983 INIT_PROC_INFO(threadInfo[i]); 1984 } 1985 1986 unsigned num_avail = 0; 1987 *line = 0; 1988 while (!feof(f)) { 1989 // Create an inner scoping level, so that all the goto targets at the end of 1990 // the loop appear in an outer scoping level. This avoids warnings about 1991 // jumping past an initialization to a target in the same block. 1992 { 1993 buf[sizeof(buf) - 1] = 1; 1994 bool long_line = false; 1995 if (!fgets(buf, sizeof(buf), f)) { 1996 // Read error, presumably because we hit EOF 1997 // If there is valid data in threadInfo[num_avail], then fake 1998 // a blank line to ensure that the last address gets parsed. 1999 bool valid = false; 2000 for (i = 0; i <= maxIndex; i++) { 2001 if (threadInfo[num_avail][i] != UINT_MAX) { 2002 valid = true; 2003 } 2004 } 2005 if (!valid) { 2006 break; 2007 } 2008 buf[0] = 0; 2009 } else if (!buf[sizeof(buf) - 1]) { 2010 // The line is longer than the buffer. Set a flag and don't 2011 // emit an error if we were going to ignore the line, anyway. 2012 long_line = true; 2013 2014 #define CHECK_LINE \ 2015 if (long_line) { \ 2016 CLEANUP_THREAD_INFO; \ 2017 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2018 return -1; \ 2019 } 2020 } 2021 (*line)++; 2022 2023 char s1[] = "processor"; 2024 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2025 CHECK_LINE; 2026 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2027 unsigned val; 2028 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2029 goto no_val; 2030 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2031 #if KMP_ARCH_AARCH64 2032 // Handle the old AArch64 /proc/cpuinfo layout differently: 2033 // it lists all of the 'processor' entries in a single 2034 // 'Processor' section, so the normal duplicate-field check 2035 // (the #else branch) would always trigger for it.
2036 num_avail++; 2037 #else 2038 goto dup_field; 2039 #endif 2040 threadInfo[num_avail][osIdIndex] = val; 2041 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2042 char path[256]; 2043 KMP_SNPRINTF( 2044 path, sizeof(path), 2045 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2046 threadInfo[num_avail][osIdIndex]); 2047 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2048 2049 KMP_SNPRINTF(path, sizeof(path), 2050 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2051 threadInfo[num_avail][osIdIndex]); 2052 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2053 continue; 2054 #else 2055 } 2056 char s2[] = "physical id"; 2057 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2058 CHECK_LINE; 2059 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2060 unsigned val; 2061 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2062 goto no_val; 2063 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2064 goto dup_field; 2065 threadInfo[num_avail][pkgIdIndex] = val; 2066 continue; 2067 } 2068 char s3[] = "core id"; 2069 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2070 CHECK_LINE; 2071 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2072 unsigned val; 2073 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2074 goto no_val; 2075 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2076 goto dup_field; 2077 threadInfo[num_avail][coreIdIndex] = val; 2078 continue; 2079 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2080 } 2081 char s4[] = "thread id"; 2082 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2083 CHECK_LINE; 2084 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2085 unsigned val; 2086 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2087 goto no_val; 2088 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2089 goto dup_field; 2090 threadInfo[num_avail][threadIdIndex] = val; 2091 continue; 2092 } 2093 unsigned level; 2094 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2095 CHECK_LINE; 2096 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2097 unsigned val; 2098 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2099 goto no_val; 2100 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2101 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2102 goto dup_field; 2103 threadInfo[num_avail][nodeIdIndex + level] = val; 2104 continue; 2105 } 2106 2107 // We didn't recognize the leading token on the line. There are lots of 2108 // leading tokens that we don't recognize - if the line isn't empty, go on 2109 // to the next line. 2110 if ((*buf != 0) && (*buf != '\n')) { 2111 // If the line is longer than the buffer, read characters 2112 // until we find a newline. 2113 if (long_line) { 2114 int ch; 2115 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2116 ; 2117 } 2118 continue; 2119 } 2120 2121 // A newline has signalled the end of the processor record. 2122 // Check that there aren't too many procs specified. 2123 if ((int)num_avail == __kmp_xproc) { 2124 CLEANUP_THREAD_INFO; 2125 *msg_id = kmp_i18n_str_TooManyEntries; 2126 return -1; 2127 } 2128 2129 // Check for missing fields. The osId field must be there, and we 2130 // currently require that the physical id field is specified, also. 
2131 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2132 CLEANUP_THREAD_INFO; 2133 *msg_id = kmp_i18n_str_MissingProcField; 2134 return -1; 2135 } 2136 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2137 CLEANUP_THREAD_INFO; 2138 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2139 return -1; 2140 } 2141 2142 // Skip this proc if it is not included in the machine model. 2143 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2144 __kmp_affin_fullMask)) { 2145 INIT_PROC_INFO(threadInfo[num_avail]); 2146 continue; 2147 } 2148 2149 // We have a successful parse of this proc's info. 2150 // Increment the counter, and prepare for the next proc. 2151 num_avail++; 2152 KMP_ASSERT(num_avail <= num_records); 2153 INIT_PROC_INFO(threadInfo[num_avail]); 2154 } 2155 continue; 2156 2157 no_val: 2158 CLEANUP_THREAD_INFO; 2159 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2160 return -1; 2161 2162 dup_field: 2163 CLEANUP_THREAD_INFO; 2164 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2165 return -1; 2166 } 2167 *line = 0; 2168 2169 #if KMP_MIC && REDUCE_TEAM_SIZE 2170 unsigned teamSize = 0; 2171 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2172 2173 // check for num_records == __kmp_xproc ??? 2174 2175 // If there's only one thread context to bind to, form an Address object with 2176 // depth 1 and return immediately (or, if affinity is off, set address2os to 2177 // NULL and return). 2178 // 2179 // If it is configured to omit the package level when there is only a single 2180 // package, the logic at the end of this routine won't work if there is only a 2181 // single thread - it would try to form an Address object with depth 0. 2182 KMP_ASSERT(num_avail > 0); 2183 KMP_ASSERT(num_avail <= num_records); 2184 if (num_avail == 1) { 2185 __kmp_ncores = 1; 2186 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2187 if (__kmp_affinity_verbose) { 2188 if (!KMP_AFFINITY_CAPABLE()) { 2189 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2190 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2191 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2192 } else { 2193 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2194 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2195 __kmp_affin_fullMask); 2196 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2197 if (__kmp_affinity_respect_mask) { 2198 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2199 } else { 2200 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2201 } 2202 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2203 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2204 } 2205 int index; 2206 kmp_str_buf_t buf; 2207 __kmp_str_buf_init(&buf); 2208 __kmp_str_buf_print(&buf, "1"); 2209 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2210 __kmp_str_buf_print(&buf, " x 1"); 2211 } 2212 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2213 __kmp_str_buf_free(&buf); 2214 } 2215 2216 if (__kmp_affinity_type == affinity_none) { 2217 CLEANUP_THREAD_INFO; 2218 return 0; 2219 } 2220 2221 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2222 Address addr(1); 2223 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2224 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2225 2226 if (__kmp_affinity_gran_levels < 0) { 2227 __kmp_affinity_gran_levels = 0; 2228 } 2229 2230 if (__kmp_affinity_verbose) { 2231 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2232 } 2233 2234 CLEANUP_THREAD_INFO; 2235 return 1; 2236 } 2237 2238 // Sort the threadInfo table by physical Id. 
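// Annotation: __kmp_affinity_cmp_ProcCpuInfo_phys_id, used by the qsort just
// below, compares from the most significant field (highest node level) down
// to osId, so after the sort all records sharing a package, and within that a
// core, are adjacent; e.g. (pkg, core, thread) triples order as
// (0,0,0) (0,0,1) (0,1,0) ... (1,1,1).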
2239 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2240 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2241 2242 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2243 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2244 // the chips on a system. Although coreId's are usually assigned 2245 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2246 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2247 // 2248 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2249 // total # packages) are at this point - we want to determine that now. We 2250 // only have an upper bound on the first two figures. 2251 unsigned *counts = 2252 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2253 unsigned *maxCt = 2254 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2255 unsigned *totals = 2256 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2257 unsigned *lastId = 2258 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2259 2260 bool assign_thread_ids = false; 2261 unsigned threadIdCt; 2262 unsigned index; 2263 2264 restart_radix_check: 2265 threadIdCt = 0; 2266 2267 // Initialize the counter arrays with data from threadInfo[0]. 2268 if (assign_thread_ids) { 2269 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2270 threadInfo[0][threadIdIndex] = threadIdCt++; 2271 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2272 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2273 } 2274 } 2275 for (index = 0; index <= maxIndex; index++) { 2276 counts[index] = 1; 2277 maxCt[index] = 1; 2278 totals[index] = 1; 2279 lastId[index] = threadInfo[0][index]; 2280 ; 2281 } 2282 2283 // Run through the rest of the OS procs. 2284 for (i = 1; i < num_avail; i++) { 2285 // Find the most significant index whose id differs from the id for the 2286 // previous OS proc. 2287 for (index = maxIndex; index >= threadIdIndex; index--) { 2288 if (assign_thread_ids && (index == threadIdIndex)) { 2289 // Auto-assign the thread id field if it wasn't specified. 2290 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2291 threadInfo[i][threadIdIndex] = threadIdCt++; 2292 } 2293 // Apparently the thread id field was specified for some entries and not 2294 // others. Start the thread id counter off at the next higher thread id. 2295 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2296 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2297 } 2298 } 2299 if (threadInfo[i][index] != lastId[index]) { 2300 // Run through all indices which are less significant, and reset the 2301 // counts to 1. At all levels up to and including index, we need to 2302 // increment the totals and record the last id. 2303 unsigned index2; 2304 for (index2 = threadIdIndex; index2 < index; index2++) { 2305 totals[index2]++; 2306 if (counts[index2] > maxCt[index2]) { 2307 maxCt[index2] = counts[index2]; 2308 } 2309 counts[index2] = 1; 2310 lastId[index2] = threadInfo[i][index2]; 2311 } 2312 counts[index]++; 2313 totals[index]++; 2314 lastId[index] = threadInfo[i][index]; 2315 2316 if (assign_thread_ids && (index > threadIdIndex)) { 2317 2318 #if KMP_MIC && REDUCE_TEAM_SIZE 2319 // The default team size is the total #threads in the machine 2320 // minus 1 thread for every core that has 3 or more threads. 2321 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2322 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2323 2324 // Restart the thread counter, as we are on a new core. 
2325 threadIdCt = 0; 2326 2327 // Auto-assign the thread id field if it wasn't specified. 2328 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2329 threadInfo[i][threadIdIndex] = threadIdCt++; 2330 } 2331 2332 // Apparently the thread id field was specified for some entries and 2333 // not others. Start the thread id counter off at the next higher 2334 // thread id. 2335 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2336 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2337 } 2338 } 2339 break; 2340 } 2341 } 2342 if (index < threadIdIndex) { 2343 // If thread ids were specified, it is an error if they are not unique. 2344 // Also, check that we haven't already restarted the loop (to be safe - 2345 // shouldn't need to). 2346 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2347 __kmp_free(lastId); 2348 __kmp_free(totals); 2349 __kmp_free(maxCt); 2350 __kmp_free(counts); 2351 CLEANUP_THREAD_INFO; 2352 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2353 return -1; 2354 } 2355 2356 // If the thread ids were not specified and we see entries that 2357 // are duplicates, start the loop over and assign the thread ids manually. 2358 assign_thread_ids = true; 2359 goto restart_radix_check; 2360 } 2361 } 2362 2363 #if KMP_MIC && REDUCE_TEAM_SIZE 2364 // The default team size is the total #threads in the machine 2365 // minus 1 thread for every core that has 3 or more threads. 2366 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2367 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2368 2369 for (index = threadIdIndex; index <= maxIndex; index++) { 2370 if (counts[index] > maxCt[index]) { 2371 maxCt[index] = counts[index]; 2372 } 2373 } 2374 2375 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2376 nCoresPerPkg = maxCt[coreIdIndex]; 2377 nPackages = totals[pkgIdIndex]; 2378 2379 // Check to see if the machine topology is uniform 2380 unsigned prod = totals[maxIndex]; 2381 for (index = threadIdIndex; index < maxIndex; index++) { 2382 prod *= maxCt[index]; 2383 } 2384 bool uniform = (prod == totals[threadIdIndex]); 2385 2386 // When affinity is off, this routine will still be called to set 2387 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2388 // Make sure all these vars are set correctly, and return now if affinity is 2389 // not enabled.
2390 __kmp_ncores = totals[coreIdIndex]; 2391 2392 if (__kmp_affinity_verbose) { 2393 if (!KMP_AFFINITY_CAPABLE()) { 2394 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2395 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2396 if (uniform) { 2397 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2398 } else { 2399 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2400 } 2401 } else { 2402 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2403 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2404 __kmp_affin_fullMask); 2405 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2406 if (__kmp_affinity_respect_mask) { 2407 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2408 } else { 2409 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2410 } 2411 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2412 if (uniform) { 2413 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2414 } else { 2415 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2416 } 2417 } 2418 kmp_str_buf_t buf; 2419 __kmp_str_buf_init(&buf); 2420 2421 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2422 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2423 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2424 } 2425 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2426 maxCt[threadIdIndex], __kmp_ncores); 2427 2428 __kmp_str_buf_free(&buf); 2429 } 2430 2431 #if KMP_MIC && REDUCE_TEAM_SIZE 2432 // Set the default team size. 2433 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2434 __kmp_dflt_team_nth = teamSize; 2435 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2436 "__kmp_dflt_team_nth = %d\n", 2437 __kmp_dflt_team_nth)); 2438 } 2439 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2440 2441 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2442 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2443 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2444 for (i = 0; i < num_avail; ++i) { // fill the os indices 2445 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2446 } 2447 2448 if (__kmp_affinity_type == affinity_none) { 2449 __kmp_free(lastId); 2450 __kmp_free(totals); 2451 __kmp_free(maxCt); 2452 __kmp_free(counts); 2453 CLEANUP_THREAD_INFO; 2454 return 0; 2455 } 2456 2457 // Count the number of levels which have more nodes at that level than at the 2458 // parent's level (with there being an implicit root node of the top level). 2459 // This is equivalent to saying that there is at least one node at this level 2460 // which has a sibling. These levels are in the map, and the package level is 2461 // always in the map. 2462 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2463 int level = 0; 2464 for (index = threadIdIndex; index < maxIndex; index++) { 2465 KMP_ASSERT(totals[index] >= totals[index + 1]); 2466 inMap[index] = (totals[index] > totals[index + 1]); 2467 } 2468 inMap[maxIndex] = (totals[maxIndex] > 1); 2469 inMap[pkgIdIndex] = true; 2470 2471 int depth = 0; 2472 for (index = threadIdIndex; index <= maxIndex; index++) { 2473 if (inMap[index]) { 2474 depth++; 2475 } 2476 } 2477 KMP_ASSERT(depth > 0); 2478 2479 // Construct the data structure that is to be returned. 
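// Annotation (hypothetical case): on a 2-package box with multi-core,
// single-threaded processors and no node_<n> records, inMap keeps only the
// package and core levels, so depth == 2 and each Address built below comes
// out as
//   labels[0] = physical id, labels[1] = core id
// with pkgLevel == 0, coreLevel == 1, and threadLevel left at -1.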
2480 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2481 int pkgLevel = -1; 2482 int coreLevel = -1; 2483 int threadLevel = -1; 2484 2485 for (i = 0; i < num_avail; ++i) { 2486 Address addr(depth); 2487 unsigned os = threadInfo[i][osIdIndex]; 2488 int src_index; 2489 int dst_index = 0; 2490 2491 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2492 if (!inMap[src_index]) { 2493 continue; 2494 } 2495 addr.labels[dst_index] = threadInfo[i][src_index]; 2496 if (src_index == pkgIdIndex) { 2497 pkgLevel = dst_index; 2498 } else if (src_index == coreIdIndex) { 2499 coreLevel = dst_index; 2500 } else if (src_index == threadIdIndex) { 2501 threadLevel = dst_index; 2502 } 2503 dst_index++; 2504 } 2505 (*address2os)[i] = AddrUnsPair(addr, os); 2506 } 2507 2508 if (__kmp_affinity_gran_levels < 0) { 2509 // Set the granularity level based on what levels are modeled 2510 // in the machine topology map. 2511 unsigned src_index; 2512 __kmp_affinity_gran_levels = 0; 2513 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2514 if (!inMap[src_index]) { 2515 continue; 2516 } 2517 switch (src_index) { 2518 case threadIdIndex: 2519 if (__kmp_affinity_gran > affinity_gran_thread) { 2520 __kmp_affinity_gran_levels++; 2521 } 2522 2523 break; 2524 case coreIdIndex: 2525 if (__kmp_affinity_gran > affinity_gran_core) { 2526 __kmp_affinity_gran_levels++; 2527 } 2528 break; 2529 2530 case pkgIdIndex: 2531 if (__kmp_affinity_gran > affinity_gran_package) { 2532 __kmp_affinity_gran_levels++; 2533 } 2534 break; 2535 } 2536 } 2537 } 2538 2539 if (__kmp_affinity_verbose) { 2540 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2541 coreLevel, threadLevel); 2542 } 2543 2544 __kmp_free(inMap); 2545 __kmp_free(lastId); 2546 __kmp_free(totals); 2547 __kmp_free(maxCt); 2548 __kmp_free(counts); 2549 CLEANUP_THREAD_INFO; 2550 return depth; 2551 } 2552 2553 // Create and return a table of affinity masks, indexed by OS thread ID. 2554 // This routine handles OR'ing together all the affinity masks of threads 2555 // that are sufficiently close, if granularity > fine. 2556 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2557 unsigned *numUnique, 2558 AddrUnsPair *address2os, 2559 unsigned numAddrs) { 2560 // First form a table of affinity masks in order of OS thread id. 2561 unsigned depth; 2562 unsigned maxOsId; 2563 unsigned i; 2564 2565 KMP_ASSERT(numAddrs > 0); 2566 depth = address2os[0].first.depth; 2567 2568 maxOsId = 0; 2569 for (i = numAddrs - 1;; --i) { 2570 unsigned osId = address2os[i].second; 2571 if (osId > maxOsId) { 2572 maxOsId = osId; 2573 } 2574 if (i == 0) 2575 break; 2576 } 2577 kmp_affin_mask_t *osId2Mask; 2578 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2579 2580 // Sort the address2os table according to physical order. Doing so will put 2581 // all threads on the same core/package/node in consecutive locations. 
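// Annotation (hypothetical numbering) for the grouping pass further below:
// with granularity=core (__kmp_affinity_gran_levels == 1) and OS procs 0 and
// 1 being the two hyperthreads of one core, both osId2Mask entries end up
// holding the mask {0,1}, and that core counts once toward *numUnique.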
2582 qsort(address2os, numAddrs, sizeof(*address2os), 2583 __kmp_affinity_cmp_Address_labels); 2584 2585 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2586 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2587 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2588 } 2589 if (__kmp_affinity_gran_levels >= (int)depth) { 2590 if (__kmp_affinity_verbose || 2591 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2592 KMP_WARNING(AffThreadsMayMigrate); 2593 } 2594 } 2595 2596 // Run through the table, forming the masks for all threads on each core. 2597 // Threads on the same core will have identical "Address" objects, not 2598 // considering the last level, which must be the thread id. All threads on a 2599 // core will appear consecutively. 2600 unsigned unique = 0; 2601 unsigned j = 0; // index of 1st thread on core 2602 unsigned leader = 0; 2603 Address *leaderAddr = &(address2os[0].first); 2604 kmp_affin_mask_t *sum; 2605 KMP_CPU_ALLOC_ON_STACK(sum); 2606 KMP_CPU_ZERO(sum); 2607 KMP_CPU_SET(address2os[0].second, sum); 2608 for (i = 1; i < numAddrs; i++) { 2609 // If this thread is sufficiently close to the leader (within the 2610 // granularity setting), then set the bit for this os thread in the 2611 // affinity mask for this group, and go on to the next thread. 2612 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2613 KMP_CPU_SET(address2os[i].second, sum); 2614 continue; 2615 } 2616 2617 // For every thread in this group, copy the mask to the thread's entry in 2618 // the osId2Mask table. Mark the first address as a leader. 2619 for (; j < i; j++) { 2620 unsigned osId = address2os[j].second; 2621 KMP_DEBUG_ASSERT(osId <= maxOsId); 2622 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2623 KMP_CPU_COPY(mask, sum); 2624 address2os[j].first.leader = (j == leader); 2625 } 2626 unique++; 2627 2628 // Start a new mask. 2629 leader = i; 2630 leaderAddr = &(address2os[i].first); 2631 KMP_CPU_ZERO(sum); 2632 KMP_CPU_SET(address2os[i].second, sum); 2633 } 2634 2635 // For every thread in last group, copy the mask to the thread's 2636 // entry in the osId2Mask table. 2637 for (; j < i; j++) { 2638 unsigned osId = address2os[j].second; 2639 KMP_DEBUG_ASSERT(osId <= maxOsId); 2640 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2641 KMP_CPU_COPY(mask, sum); 2642 address2os[j].first.leader = (j == leader); 2643 } 2644 unique++; 2645 KMP_CPU_FREE_FROM_STACK(sum); 2646 2647 *maxIndex = maxOsId; 2648 *numUnique = unique; 2649 return osId2Mask; 2650 } 2651 2652 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2653 // as file-static than to try and pass them through the calling sequence of 2654 // the recursive-descent OMP_PLACES parser. 
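// Annotation: the proclist parser below accepts comma-separated entries of
// the form <n>, <n>-<m>, <n>-<m>:<stride> (the stride may be negative), or a
// braced set such as {0,2,4} that is OR'ed into a single mask. An
// illustrative input (OS proc numbers are hypothetical) is
//   "3,0-2,{8,9},12-0:-4"
// typically reaching this code via KMP_AFFINITY="explicit,proclist=[...]".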
2655 static kmp_affin_mask_t *newMasks; 2656 static int numNewMasks; 2657 static int nextNewMask; 2658 2659 #define ADD_MASK(_mask) \ 2660 { \ 2661 if (nextNewMask >= numNewMasks) { \ 2662 int i; \ 2663 numNewMasks *= 2; \ 2664 kmp_affin_mask_t *temp; \ 2665 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2666 for (i = 0; i < numNewMasks / 2; i++) { \ 2667 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2668 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2669 KMP_CPU_COPY(dest, src); \ 2670 } \ 2671 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2672 newMasks = temp; \ 2673 } \ 2674 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2675 nextNewMask++; \ 2676 } 2677 2678 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2679 { \ 2680 if (((_osId) > _maxOsId) || \ 2681 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2682 if (__kmp_affinity_verbose || \ 2683 (__kmp_affinity_warnings && \ 2684 (__kmp_affinity_type != affinity_none))) { \ 2685 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2686 } \ 2687 } else { \ 2688 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2689 } \ 2690 } 2691 2692 // Re-parse the proclist (for the explicit affinity type), and form the list 2693 // of affinity newMasks indexed by gtid. 2694 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2695 unsigned int *out_numMasks, 2696 const char *proclist, 2697 kmp_affin_mask_t *osId2Mask, 2698 int maxOsId) { 2699 int i; 2700 const char *scan = proclist; 2701 const char *next = proclist; 2702 2703 // We use malloc() for the temporary mask vector, so that we can use 2704 // realloc() to extend it. 2705 numNewMasks = 2; 2706 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2707 nextNewMask = 0; 2708 kmp_affin_mask_t *sumMask; 2709 KMP_CPU_ALLOC(sumMask); 2710 int setSize = 0; 2711 2712 for (;;) { 2713 int start, end, stride; 2714 2715 SKIP_WS(scan); 2716 next = scan; 2717 if (*next == '\0') { 2718 break; 2719 } 2720 2721 if (*next == '{') { 2722 int num; 2723 setSize = 0; 2724 next++; // skip '{' 2725 SKIP_WS(next); 2726 scan = next; 2727 2728 // Read the first integer in the set. 2729 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2730 SKIP_DIGITS(next); 2731 num = __kmp_str_to_int(scan, *next); 2732 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2733 2734 // Copy the mask for that osId to the sum (union) mask. 2735 if ((num > maxOsId) || 2736 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2737 if (__kmp_affinity_verbose || 2738 (__kmp_affinity_warnings && 2739 (__kmp_affinity_type != affinity_none))) { 2740 KMP_WARNING(AffIgnoreInvalidProcID, num); 2741 } 2742 KMP_CPU_ZERO(sumMask); 2743 } else { 2744 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2745 setSize = 1; 2746 } 2747 2748 for (;;) { 2749 // Check for end of set. 2750 SKIP_WS(next); 2751 if (*next == '}') { 2752 next++; // skip '}' 2753 break; 2754 } 2755 2756 // Skip optional comma. 2757 if (*next == ',') { 2758 next++; 2759 } 2760 SKIP_WS(next); 2761 2762 // Read the next integer in the set. 2763 scan = next; 2764 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2765 2766 SKIP_DIGITS(next); 2767 num = __kmp_str_to_int(scan, *next); 2768 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2769 2770 // Add the mask for that osId to the sum mask. 
2771 if ((num > maxOsId) || 2772 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2773 if (__kmp_affinity_verbose || 2774 (__kmp_affinity_warnings && 2775 (__kmp_affinity_type != affinity_none))) { 2776 KMP_WARNING(AffIgnoreInvalidProcID, num); 2777 } 2778 } else { 2779 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2780 setSize++; 2781 } 2782 } 2783 if (setSize > 0) { 2784 ADD_MASK(sumMask); 2785 } 2786 2787 SKIP_WS(next); 2788 if (*next == ',') { 2789 next++; 2790 } 2791 scan = next; 2792 continue; 2793 } 2794 2795 // Read the first integer. 2796 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2797 SKIP_DIGITS(next); 2798 start = __kmp_str_to_int(scan, *next); 2799 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2800 SKIP_WS(next); 2801 2802 // If this isn't a range, then add a mask to the list and go on. 2803 if (*next != '-') { 2804 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2805 2806 // Skip optional comma. 2807 if (*next == ',') { 2808 next++; 2809 } 2810 scan = next; 2811 continue; 2812 } 2813 2814 // This is a range. Skip over the '-' and read in the 2nd int. 2815 next++; // skip '-' 2816 SKIP_WS(next); 2817 scan = next; 2818 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2819 SKIP_DIGITS(next); 2820 end = __kmp_str_to_int(scan, *next); 2821 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2822 2823 // Check for a stride parameter 2824 stride = 1; 2825 SKIP_WS(next); 2826 if (*next == ':') { 2827 // A stride is specified. Skip over the ':" and read the 3rd int. 2828 int sign = +1; 2829 next++; // skip ':' 2830 SKIP_WS(next); 2831 scan = next; 2832 if (*next == '-') { 2833 sign = -1; 2834 next++; 2835 SKIP_WS(next); 2836 scan = next; 2837 } 2838 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2839 SKIP_DIGITS(next); 2840 stride = __kmp_str_to_int(scan, *next); 2841 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2842 stride *= sign; 2843 } 2844 2845 // Do some range checks. 2846 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2847 if (stride > 0) { 2848 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2849 } else { 2850 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2851 } 2852 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2853 2854 // Add the mask for each OS proc # to the list. 2855 if (stride > 0) { 2856 do { 2857 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2858 start += stride; 2859 } while (start <= end); 2860 } else { 2861 do { 2862 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2863 start += stride; 2864 } while (start >= end); 2865 } 2866 2867 // Skip optional comma. 2868 SKIP_WS(next); 2869 if (*next == ',') { 2870 next++; 2871 } 2872 scan = next; 2873 } 2874 2875 *out_numMasks = nextNewMask; 2876 if (nextNewMask == 0) { 2877 *out_masks = NULL; 2878 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2879 return; 2880 } 2881 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2882 for (i = 0; i < nextNewMask; i++) { 2883 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2884 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2885 KMP_CPU_COPY(dest, src); 2886 } 2887 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2888 KMP_CPU_FREE(sumMask); 2889 } 2890 2891 #if OMP_40_ENABLED 2892 2893 /*----------------------------------------------------------------------------- 2894 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2895 places. 
Again, Here is the grammar: 2896 2897 place_list := place 2898 place_list := place , place_list 2899 place := num 2900 place := place : num 2901 place := place : num : signed 2902 place := { subplacelist } 2903 place := ! place // (lowest priority) 2904 subplace_list := subplace 2905 subplace_list := subplace , subplace_list 2906 subplace := num 2907 subplace := num : num 2908 subplace := num : num : signed 2909 signed := num 2910 signed := + signed 2911 signed := - signed 2912 -----------------------------------------------------------------------------*/ 2913 2914 static void __kmp_process_subplace_list(const char **scan, 2915 kmp_affin_mask_t *osId2Mask, 2916 int maxOsId, kmp_affin_mask_t *tempMask, 2917 int *setSize) { 2918 const char *next; 2919 2920 for (;;) { 2921 int start, count, stride, i; 2922 2923 // Read in the starting proc id 2924 SKIP_WS(*scan); 2925 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2926 next = *scan; 2927 SKIP_DIGITS(next); 2928 start = __kmp_str_to_int(*scan, *next); 2929 KMP_ASSERT(start >= 0); 2930 *scan = next; 2931 2932 // valid follow sets are ',' ':' and '}' 2933 SKIP_WS(*scan); 2934 if (**scan == '}' || **scan == ',') { 2935 if ((start > maxOsId) || 2936 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2937 if (__kmp_affinity_verbose || 2938 (__kmp_affinity_warnings && 2939 (__kmp_affinity_type != affinity_none))) { 2940 KMP_WARNING(AffIgnoreInvalidProcID, start); 2941 } 2942 } else { 2943 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2944 (*setSize)++; 2945 } 2946 if (**scan == '}') { 2947 break; 2948 } 2949 (*scan)++; // skip ',' 2950 continue; 2951 } 2952 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2953 (*scan)++; // skip ':' 2954 2955 // Read count parameter 2956 SKIP_WS(*scan); 2957 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2958 next = *scan; 2959 SKIP_DIGITS(next); 2960 count = __kmp_str_to_int(*scan, *next); 2961 KMP_ASSERT(count >= 0); 2962 *scan = next; 2963 2964 // valid follow sets are ',' ':' and '}' 2965 SKIP_WS(*scan); 2966 if (**scan == '}' || **scan == ',') { 2967 for (i = 0; i < count; i++) { 2968 if ((start > maxOsId) || 2969 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2970 if (__kmp_affinity_verbose || 2971 (__kmp_affinity_warnings && 2972 (__kmp_affinity_type != affinity_none))) { 2973 KMP_WARNING(AffIgnoreInvalidProcID, start); 2974 } 2975 break; // don't proliferate warnings for large count 2976 } else { 2977 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2978 start++; 2979 (*setSize)++; 2980 } 2981 } 2982 if (**scan == '}') { 2983 break; 2984 } 2985 (*scan)++; // skip ',' 2986 continue; 2987 } 2988 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2989 (*scan)++; // skip ':' 2990 2991 // Read stride parameter 2992 int sign = +1; 2993 for (;;) { 2994 SKIP_WS(*scan); 2995 if (**scan == '+') { 2996 (*scan)++; // skip '+' 2997 continue; 2998 } 2999 if (**scan == '-') { 3000 sign *= -1; 3001 (*scan)++; // skip '-' 3002 continue; 3003 } 3004 break; 3005 } 3006 SKIP_WS(*scan); 3007 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3008 next = *scan; 3009 SKIP_DIGITS(next); 3010 stride = __kmp_str_to_int(*scan, *next); 3011 KMP_ASSERT(stride >= 0); 3012 *scan = next; 3013 stride *= sign; 3014 3015 // valid follow sets are ',' and '}' 3016 SKIP_WS(*scan); 3017 if (**scan == '}' || **scan == ',') { 3018 for (i = 0; i < count; i++) { 3019 if ((start > maxOsId) || 3020 
(!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3021 if (__kmp_affinity_verbose || 3022 (__kmp_affinity_warnings && 3023 (__kmp_affinity_type != affinity_none))) { 3024 KMP_WARNING(AffIgnoreInvalidProcID, start); 3025 } 3026 break; // don't proliferate warnings for large count 3027 } else { 3028 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3029 start += stride; 3030 (*setSize)++; 3031 } 3032 } 3033 if (**scan == '}') { 3034 break; 3035 } 3036 (*scan)++; // skip ',' 3037 continue; 3038 } 3039 3040 KMP_ASSERT2(0, "bad explicit places list"); 3041 } 3042 } 3043 3044 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3045 int maxOsId, kmp_affin_mask_t *tempMask, 3046 int *setSize) { 3047 const char *next; 3048 3049 // valid follow sets are '{' '!' and num 3050 SKIP_WS(*scan); 3051 if (**scan == '{') { 3052 (*scan)++; // skip '{' 3053 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3054 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3055 (*scan)++; // skip '}' 3056 } else if (**scan == '!') { 3057 (*scan)++; // skip '!' 3058 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3059 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3060 } else if ((**scan >= '0') && (**scan <= '9')) { 3061 next = *scan; 3062 SKIP_DIGITS(next); 3063 int num = __kmp_str_to_int(*scan, *next); 3064 KMP_ASSERT(num >= 0); 3065 if ((num > maxOsId) || 3066 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3067 if (__kmp_affinity_verbose || 3068 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3069 KMP_WARNING(AffIgnoreInvalidProcID, num); 3070 } 3071 } else { 3072 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3073 (*setSize)++; 3074 } 3075 *scan = next; // skip num 3076 } else { 3077 KMP_ASSERT2(0, "bad explicit places list"); 3078 } 3079 } 3080 3081 // static void 3082 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3083 unsigned int *out_numMasks, 3084 const char *placelist, 3085 kmp_affin_mask_t *osId2Mask, 3086 int maxOsId) { 3087 int i, j, count, stride, sign; 3088 const char *scan = placelist; 3089 const char *next = placelist; 3090 3091 numNewMasks = 2; 3092 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3093 nextNewMask = 0; 3094 3095 // tempMask is modified based on the previous or initial 3096 // place to form the current place 3097 // previousMask contains the previous place 3098 kmp_affin_mask_t *tempMask; 3099 kmp_affin_mask_t *previousMask; 3100 KMP_CPU_ALLOC(tempMask); 3101 KMP_CPU_ZERO(tempMask); 3102 KMP_CPU_ALLOC(previousMask); 3103 KMP_CPU_ZERO(previousMask); 3104 int setSize = 0; 3105 3106 for (;;) { 3107 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3108 3109 // valid follow sets are ',' ':' and EOL 3110 SKIP_WS(scan); 3111 if (*scan == '\0' || *scan == ',') { 3112 if (setSize > 0) { 3113 ADD_MASK(tempMask); 3114 } 3115 KMP_CPU_ZERO(tempMask); 3116 setSize = 0; 3117 if (*scan == '\0') { 3118 break; 3119 } 3120 scan++; // skip ',' 3121 continue; 3122 } 3123 3124 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3125 scan++; // skip ':' 3126 3127 // Read count parameter 3128 SKIP_WS(scan); 3129 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3130 next = scan; 3131 SKIP_DIGITS(next); 3132 count = __kmp_str_to_int(scan, *next); 3133 KMP_ASSERT(count >= 0); 3134 scan = next; 3135 3136 // valid follow sets are ',' ':' and EOL 3137 SKIP_WS(scan); 3138 if (*scan == '\0' || *scan == ',') { 3139 stride = 
+1; 3140 } else { 3141 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3142 scan++; // skip ':' 3143 3144 // Read stride parameter 3145 sign = +1; 3146 for (;;) { 3147 SKIP_WS(scan); 3148 if (*scan == '+') { 3149 scan++; // skip '+' 3150 continue; 3151 } 3152 if (*scan == '-') { 3153 sign *= -1; 3154 scan++; // skip '-' 3155 continue; 3156 } 3157 break; 3158 } 3159 SKIP_WS(scan); 3160 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3161 next = scan; 3162 SKIP_DIGITS(next); 3163 stride = __kmp_str_to_int(scan, *next); 3164 KMP_DEBUG_ASSERT(stride >= 0); 3165 scan = next; 3166 stride *= sign; 3167 } 3168 3169 // Add places determined by initial_place : count : stride 3170 for (i = 0; i < count; i++) { 3171 if (setSize == 0) { 3172 break; 3173 } 3174 // Add the current place, then build the next place (tempMask) from that 3175 KMP_CPU_COPY(previousMask, tempMask); 3176 ADD_MASK(previousMask); 3177 KMP_CPU_ZERO(tempMask); 3178 setSize = 0; 3179 KMP_CPU_SET_ITERATE(j, previousMask) { 3180 if (!KMP_CPU_ISSET(j, previousMask)) { 3181 continue; 3182 } 3183 if ((j + stride > maxOsId) || (j + stride < 0) || 3184 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3185 (!KMP_CPU_ISSET(j + stride, 3186 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3187 if ((__kmp_affinity_verbose || 3188 (__kmp_affinity_warnings && 3189 (__kmp_affinity_type != affinity_none))) && 3190 i < count - 1) { 3191 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3192 } 3193 continue; 3194 } 3195 KMP_CPU_SET(j + stride, tempMask); 3196 setSize++; 3197 } 3198 } 3199 KMP_CPU_ZERO(tempMask); 3200 setSize = 0; 3201 3202 // valid follow sets are ',' and EOL 3203 SKIP_WS(scan); 3204 if (*scan == '\0') { 3205 break; 3206 } 3207 if (*scan == ',') { 3208 scan++; // skip ',' 3209 continue; 3210 } 3211 3212 KMP_ASSERT2(0, "bad explicit places list"); 3213 } 3214 3215 *out_numMasks = nextNewMask; 3216 if (nextNewMask == 0) { 3217 *out_masks = NULL; 3218 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3219 return; 3220 } 3221 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3222 KMP_CPU_FREE(tempMask); 3223 KMP_CPU_FREE(previousMask); 3224 for (i = 0; i < nextNewMask; i++) { 3225 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3226 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3227 KMP_CPU_COPY(dest, src); 3228 } 3229 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3230 } 3231 3232 #endif /* OMP_40_ENABLED */ 3233 3234 #undef ADD_MASK 3235 #undef ADD_MASK_OSID 3236 3237 #if KMP_USE_HWLOC 3238 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3239 // skip PUs descendants of the object o 3240 int skipped = 0; 3241 hwloc_obj_t hT = NULL; 3242 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3243 for (int i = 0; i < N; ++i) { 3244 KMP_DEBUG_ASSERT(hT); 3245 unsigned idx = hT->os_index; 3246 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3247 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3248 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3249 ++skipped; 3250 } 3251 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3252 } 3253 return skipped; // count number of skipped units 3254 } 3255 3256 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3257 // check if obj has PUs present in fullMask 3258 hwloc_obj_t hT = NULL; 3259 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3260 for (int i = 0; i < N; ++i) { 3261 KMP_DEBUG_ASSERT(hT); 3262 unsigned idx = hT->os_index; 3263 if (KMP_CPU_ISSET(idx, 
__kmp_affin_fullMask)) 3264 return 1; // found PU 3265 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3266 } 3267 return 0; // no PUs found 3268 } 3269 #endif // KMP_USE_HWLOC 3270 3271 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3272 AddrUnsPair *newAddr; 3273 if (__kmp_hws_requested == 0) 3274 goto _exit; // no topology limiting actions requested, exit 3275 #if KMP_USE_HWLOC 3276 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3277 // Number of subobjects calculated dynamically, this works fine for 3278 // any non-uniform topology. 3279 // L2 cache objects are determined by depth, other objects - by type. 3280 hwloc_topology_t tp = __kmp_hwloc_topology; 3281 int nS = 0, nN = 0, nL = 0, nC = 0, 3282 nT = 0; // logical index including skipped 3283 int nCr = 0, nTr = 0; // number of requested units 3284 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters 3285 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) 3286 int L2depth, idx; 3287 3288 // check support of extensions ---------------------------------- 3289 int numa_support = 0, tile_support = 0; 3290 if (__kmp_pu_os_idx) 3291 hT = hwloc_get_pu_obj_by_os_index(tp, 3292 __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3293 else 3294 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3295 if (hT == NULL) { // something's gone wrong 3296 KMP_WARNING(AffHWSubsetUnsupported); 3297 goto _exit; 3298 } 3299 // check NUMA node 3300 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3301 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3302 if (hN != NULL && hN->depth > hS->depth) { 3303 numa_support = 1; // 1 in case socket includes node(s) 3304 } else if (__kmp_hws_node.num > 0) { 3305 // don't support sockets inside NUMA node (no such HW found for testing) 3306 KMP_WARNING(AffHWSubsetUnsupported); 3307 goto _exit; 3308 } 3309 // check L2 cahce, get object by depth because of multiple caches 3310 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3311 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3312 if (hL != NULL && 3313 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { 3314 tile_support = 1; // no sense to count L2 if it includes single core 3315 } else if (__kmp_hws_tile.num > 0) { 3316 if (__kmp_hws_core.num == 0) { 3317 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3318 __kmp_hws_tile.num = 0; 3319 } else { 3320 // L2 and core are both requested, but represent same object 3321 KMP_WARNING(AffHWSubsetInvalid); 3322 goto _exit; 3323 } 3324 } 3325 // end of check of extensions ----------------------------------- 3326 3327 // fill in unset items, validate settings ----------------------- 3328 if (__kmp_hws_socket.num == 0) 3329 __kmp_hws_socket.num = nPackages; // use all available sockets 3330 if (__kmp_hws_socket.offset >= nPackages) { 3331 KMP_WARNING(AffHWSubsetManySockets); 3332 goto _exit; 3333 } 3334 if (numa_support) { 3335 hN = NULL; 3336 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3337 &hN); // num nodes in socket 3338 if (__kmp_hws_node.num == 0) 3339 __kmp_hws_node.num = NN; // use all available nodes 3340 if (__kmp_hws_node.offset >= NN) { 3341 KMP_WARNING(AffHWSubsetManyNodes); 3342 goto _exit; 3343 } 3344 if (tile_support) { 3345 // get num tiles in node 3346 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3347 if (__kmp_hws_tile.num == 0) { 3348 __kmp_hws_tile.num = NL + 1; 3349 } // use all 
available tiles, some node may have more tiles, thus +1 3350 if (__kmp_hws_tile.offset >= NL) { 3351 KMP_WARNING(AffHWSubsetManyTiles); 3352 goto _exit; 3353 } 3354 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3355 &hC); // num cores in tile 3356 if (__kmp_hws_core.num == 0) 3357 __kmp_hws_core.num = NC; // use all available cores 3358 if (__kmp_hws_core.offset >= NC) { 3359 KMP_WARNING(AffHWSubsetManyCores); 3360 goto _exit; 3361 } 3362 } else { // tile_support 3363 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3364 &hC); // num cores in node 3365 if (__kmp_hws_core.num == 0) 3366 __kmp_hws_core.num = NC; // use all available cores 3367 if (__kmp_hws_core.offset >= NC) { 3368 KMP_WARNING(AffHWSubsetManyCores); 3369 goto _exit; 3370 } 3371 } // tile_support 3372 } else { // numa_support 3373 if (tile_support) { 3374 // get num tiles in socket 3375 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3376 if (__kmp_hws_tile.num == 0) 3377 __kmp_hws_tile.num = NL; // use all available tiles 3378 if (__kmp_hws_tile.offset >= NL) { 3379 KMP_WARNING(AffHWSubsetManyTiles); 3380 goto _exit; 3381 } 3382 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3383 &hC); // num cores in tile 3384 if (__kmp_hws_core.num == 0) 3385 __kmp_hws_core.num = NC; // use all available cores 3386 if (__kmp_hws_core.offset >= NC) { 3387 KMP_WARNING(AffHWSubsetManyCores); 3388 goto _exit; 3389 } 3390 } else { // tile_support 3391 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3392 &hC); // num cores in socket 3393 if (__kmp_hws_core.num == 0) 3394 __kmp_hws_core.num = NC; // use all available cores 3395 if (__kmp_hws_core.offset >= NC) { 3396 KMP_WARNING(AffHWSubsetManyCores); 3397 goto _exit; 3398 } 3399 } // tile_support 3400 } 3401 if (__kmp_hws_proc.num == 0) 3402 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3403 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3404 KMP_WARNING(AffHWSubsetManyProcs); 3405 goto _exit; 3406 } 3407 // end of validation -------------------------------------------- 3408 3409 if (pAddr) // pAddr is NULL in case of affinity_none 3410 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3411 __kmp_avail_proc); // max size 3412 // main loop to form HW subset ---------------------------------- 3413 hS = NULL; 3414 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3415 for (int s = 0; s < NP; ++s) { 3416 // Check Socket ----------------------------------------------- 3417 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3418 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3419 continue; // skip socket if all PUs are out of fullMask 3420 ++nS; // only count objects those have PUs in affinity mask 3421 if (nS <= __kmp_hws_socket.offset || 3422 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3423 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3424 continue; // move to next socket 3425 } 3426 nCr = 0; // count number of cores per socket 3427 // socket requested, go down the topology tree 3428 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3429 if (numa_support) { 3430 nN = 0; 3431 hN = NULL; 3432 // num nodes in current socket 3433 int NN = 3434 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3435 for (int n = 0; n < NN; ++n) { 3436 // Check NUMA Node ---------------------------------------- 3437 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3438 hN = hwloc_get_next_obj_by_type(tp, 
HWLOC_OBJ_NUMANODE, hN); 3439 continue; // skip node if all PUs are out of fullMask 3440 } 3441 ++nN; 3442 if (nN <= __kmp_hws_node.offset || 3443 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3444 // skip node as not requested 3445 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node 3446 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3447 continue; // move to next node 3448 } 3449 // node requested, go down the topology tree 3450 if (tile_support) { 3451 nL = 0; 3452 hL = NULL; 3453 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3454 for (int l = 0; l < NL; ++l) { 3455 // Check L2 (tile) ------------------------------------ 3456 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3457 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3458 continue; // skip tile if all PUs are out of fullMask 3459 } 3460 ++nL; 3461 if (nL <= __kmp_hws_tile.offset || 3462 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3463 // skip tile as not requested 3464 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3465 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3466 continue; // move to next tile 3467 } 3468 // tile requested, go down the topology tree 3469 nC = 0; 3470 hC = NULL; 3471 // num cores in current tile 3472 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3473 HWLOC_OBJ_CORE, &hC); 3474 for (int c = 0; c < NC; ++c) { 3475 // Check Core --------------------------------------- 3476 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3477 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3478 continue; // skip core if all PUs are out of fullMask 3479 } 3480 ++nC; 3481 if (nC <= __kmp_hws_core.offset || 3482 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3483 // skip node as not requested 3484 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3485 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3486 continue; // move to next node 3487 } 3488 // core requested, go down to PUs 3489 nT = 0; 3490 nTr = 0; 3491 hT = NULL; 3492 // num procs in current core 3493 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3494 HWLOC_OBJ_PU, &hT); 3495 for (int t = 0; t < NT; ++t) { 3496 // Check PU --------------------------------------- 3497 idx = hT->os_index; 3498 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3499 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3500 continue; // skip PU if not in fullMask 3501 } 3502 ++nT; 3503 if (nT <= __kmp_hws_proc.offset || 3504 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3505 // skip PU 3506 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3507 ++n_old; 3508 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3509 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3510 continue; // move to next node 3511 } 3512 ++nTr; 3513 if (pAddr) // collect requested thread's data 3514 newAddr[n_new] = (*pAddr)[n_old]; 3515 ++n_new; 3516 ++n_old; 3517 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3518 } // threads loop 3519 if (nTr > 0) { 3520 ++nCr; // num cores per socket 3521 ++nCo; // total num cores 3522 if (nTr > nTpC) 3523 nTpC = nTr; // calc max threads per core 3524 } 3525 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3526 } // cores loop 3527 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3528 } // tiles loop 3529 } else { // tile_support 3530 // no tiles, check cores 3531 nC = 0; 3532 hC = NULL; 3533 // num cores in current node 3534 int NC = 3535 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3536 for (int c = 0; c < NC; ++c) { 3537 // Check Core 
--------------------------------------- 3538 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3539 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3540 continue; // skip core if all PUs are out of fullMask 3541 } 3542 ++nC; 3543 if (nC <= __kmp_hws_core.offset || 3544 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3545 // skip node as not requested 3546 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3547 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3548 continue; // move to next node 3549 } 3550 // core requested, go down to PUs 3551 nT = 0; 3552 nTr = 0; 3553 hT = NULL; 3554 int NT = 3555 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3556 for (int t = 0; t < NT; ++t) { 3557 // Check PU --------------------------------------- 3558 idx = hT->os_index; 3559 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3560 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3561 continue; // skip PU if not in fullMask 3562 } 3563 ++nT; 3564 if (nT <= __kmp_hws_proc.offset || 3565 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3566 // skip PU 3567 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3568 ++n_old; 3569 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3570 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3571 continue; // move to next node 3572 } 3573 ++nTr; 3574 if (pAddr) // collect requested thread's data 3575 newAddr[n_new] = (*pAddr)[n_old]; 3576 ++n_new; 3577 ++n_old; 3578 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3579 } // threads loop 3580 if (nTr > 0) { 3581 ++nCr; // num cores per socket 3582 ++nCo; // total num cores 3583 if (nTr > nTpC) 3584 nTpC = nTr; // calc max threads per core 3585 } 3586 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3587 } // cores loop 3588 } // tiles support 3589 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3590 } // nodes loop 3591 } else { // numa_support 3592 // no NUMA support 3593 if (tile_support) { 3594 nL = 0; 3595 hL = NULL; 3596 // num tiles in current socket 3597 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3598 for (int l = 0; l < NL; ++l) { 3599 // Check L2 (tile) ------------------------------------ 3600 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3601 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3602 continue; // skip tile if all PUs are out of fullMask 3603 } 3604 ++nL; 3605 if (nL <= __kmp_hws_tile.offset || 3606 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3607 // skip tile as not requested 3608 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3609 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3610 continue; // move to next tile 3611 } 3612 // tile requested, go down the topology tree 3613 nC = 0; 3614 hC = NULL; 3615 // num cores per tile 3616 int NC = 3617 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC); 3618 for (int c = 0; c < NC; ++c) { 3619 // Check Core --------------------------------------- 3620 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3621 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3622 continue; // skip core if all PUs are out of fullMask 3623 } 3624 ++nC; 3625 if (nC <= __kmp_hws_core.offset || 3626 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3627 // skip node as not requested 3628 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3629 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3630 continue; // move to next node 3631 } 3632 // core requested, go down to PUs 3633 nT = 0; 3634 nTr = 0; 3635 hT = NULL; 3636 // num procs per core 3637 int NT 
= 3638 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3639 for (int t = 0; t < NT; ++t) { 3640 // Check PU --------------------------------------- 3641 idx = hT->os_index; 3642 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3643 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3644 continue; // skip PU if not in fullMask 3645 } 3646 ++nT; 3647 if (nT <= __kmp_hws_proc.offset || 3648 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3649 // skip PU 3650 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3651 ++n_old; 3652 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3653 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3654 continue; // move to next node 3655 } 3656 ++nTr; 3657 if (pAddr) // collect requested thread's data 3658 newAddr[n_new] = (*pAddr)[n_old]; 3659 ++n_new; 3660 ++n_old; 3661 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3662 } // threads loop 3663 if (nTr > 0) { 3664 ++nCr; // num cores per socket 3665 ++nCo; // total num cores 3666 if (nTr > nTpC) 3667 nTpC = nTr; // calc max threads per core 3668 } 3669 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3670 } // cores loop 3671 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3672 } // tiles loop 3673 } else { // tile_support 3674 // no tiles, check cores 3675 nC = 0; 3676 hC = NULL; 3677 // num cores in socket 3678 int NC = 3679 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3680 for (int c = 0; c < NC; ++c) { 3681 // Check Core ------------------------------------------- 3682 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3683 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3684 continue; // skip core if all PUs are out of fullMask 3685 } 3686 ++nC; 3687 if (nC <= __kmp_hws_core.offset || 3688 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3689 // skip node as not requested 3690 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3691 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3692 continue; // move to next node 3693 } 3694 // core requested, go down to PUs 3695 nT = 0; 3696 nTr = 0; 3697 hT = NULL; 3698 // num procs per core 3699 int NT = 3700 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3701 for (int t = 0; t < NT; ++t) { 3702 // Check PU --------------------------------------- 3703 idx = hT->os_index; 3704 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3705 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3706 continue; // skip PU if not in fullMask 3707 } 3708 ++nT; 3709 if (nT <= __kmp_hws_proc.offset || 3710 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3711 // skip PU 3712 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3713 ++n_old; 3714 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3715 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3716 continue; // move to next node 3717 } 3718 ++nTr; 3719 if (pAddr) // collect requested thread's data 3720 newAddr[n_new] = (*pAddr)[n_old]; 3721 ++n_new; 3722 ++n_old; 3723 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3724 } // threads loop 3725 if (nTr > 0) { 3726 ++nCr; // num cores per socket 3727 ++nCo; // total num cores 3728 if (nTr > nTpC) 3729 nTpC = nTr; // calc max threads per core 3730 } 3731 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3732 } // cores loop 3733 } // tiles support 3734 } // numa_support 3735 if (nCr > 0) { // found cores? 
3736 ++nPkg; // num sockets 3737 if (nCr > nCpP) 3738 nCpP = nCr; // calc max cores per socket 3739 } 3740 } // sockets loop 3741 3742 // check the subset is valid 3743 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3744 KMP_DEBUG_ASSERT(nPkg > 0); 3745 KMP_DEBUG_ASSERT(nCpP > 0); 3746 KMP_DEBUG_ASSERT(nTpC > 0); 3747 KMP_DEBUG_ASSERT(nCo > 0); 3748 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3749 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3750 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3751 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3752 3753 nPackages = nPkg; // correct num sockets 3754 nCoresPerPkg = nCpP; // correct num cores per socket 3755 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3756 __kmp_avail_proc = n_new; // correct num procs 3757 __kmp_ncores = nCo; // correct num cores 3758 // hwloc topology method end 3759 } else 3760 #endif // KMP_USE_HWLOC 3761 { 3762 int n_old = 0, n_new = 0, proc_num = 0; 3763 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3764 KMP_WARNING(AffHWSubsetNoHWLOC); 3765 goto _exit; 3766 } 3767 if (__kmp_hws_socket.num == 0) 3768 __kmp_hws_socket.num = nPackages; // use all available sockets 3769 if (__kmp_hws_core.num == 0) 3770 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3771 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3772 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3773 if (!__kmp_affinity_uniform_topology()) { 3774 KMP_WARNING(AffHWSubsetNonUniform); 3775 goto _exit; // don't support non-uniform topology 3776 } 3777 if (depth > 3) { 3778 KMP_WARNING(AffHWSubsetNonThreeLevel); 3779 goto _exit; // don't support not-3-level topology 3780 } 3781 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3782 KMP_WARNING(AffHWSubsetManySockets); 3783 goto _exit; 3784 } 3785 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 3786 KMP_WARNING(AffHWSubsetManyCores); 3787 goto _exit; 3788 } 3789 // Form the requested subset 3790 if (pAddr) // pAddr is NULL in case of affinity_none 3791 newAddr = (AddrUnsPair *)__kmp_allocate( 3792 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * 3793 __kmp_hws_proc.num); 3794 for (int i = 0; i < nPackages; ++i) { 3795 if (i < __kmp_hws_socket.offset || 3796 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3797 // skip not-requested socket 3798 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3799 if (__kmp_pu_os_idx != NULL) { 3800 // walk through skipped socket 3801 for (int j = 0; j < nCoresPerPkg; ++j) { 3802 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3803 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3804 ++proc_num; 3805 } 3806 } 3807 } 3808 } else { 3809 // walk through requested socket 3810 for (int j = 0; j < nCoresPerPkg; ++j) { 3811 if (j < __kmp_hws_core.offset || 3812 j >= __kmp_hws_core.offset + 3813 __kmp_hws_core.num) { // skip not-requested core 3814 n_old += __kmp_nThreadsPerCore; 3815 if (__kmp_pu_os_idx != NULL) { 3816 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3817 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3818 ++proc_num; 3819 } 3820 } 3821 } else { 3822 // walk through requested core 3823 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3824 if (k < __kmp_hws_proc.num) { 3825 if (pAddr) // collect requested thread's data 3826 newAddr[n_new] = (*pAddr)[n_old]; 3827 n_new++; 3828 } else { 3829 if (__kmp_pu_os_idx != NULL) 3830 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3831 } 3832 n_old++; 3833 ++proc_num; 3834 
} 3835 } 3836 } 3837 } 3838 } 3839 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3840 KMP_DEBUG_ASSERT(n_new == 3841 __kmp_hws_socket.num * __kmp_hws_core.num * 3842 __kmp_hws_proc.num); 3843 nPackages = __kmp_hws_socket.num; // correct nPackages 3844 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3845 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3846 __kmp_avail_proc = n_new; // correct avail_proc 3847 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3848 } // non-hwloc topology method 3849 if (pAddr) { 3850 __kmp_free(*pAddr); 3851 *pAddr = newAddr; // replace old topology with new one 3852 } 3853 if (__kmp_affinity_verbose) { 3854 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3855 __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, 3856 __kmp_affin_fullMask); 3857 if (__kmp_affinity_respect_mask) { 3858 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3859 } else { 3860 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3861 } 3862 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3863 kmp_str_buf_t buf; 3864 __kmp_str_buf_init(&buf); 3865 __kmp_str_buf_print(&buf, "%d", nPackages); 3866 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3867 __kmp_nThreadsPerCore, __kmp_ncores); 3868 __kmp_str_buf_free(&buf); 3869 } 3870 _exit: 3871 if (__kmp_pu_os_idx != NULL) { 3872 __kmp_free(__kmp_pu_os_idx); 3873 __kmp_pu_os_idx = NULL; 3874 } 3875 } 3876 3877 // This function figures out the deepest level at which there is at least one 3878 // cluster/core with more than one processing unit bound to it. 3879 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3880 int nprocs, int bottom_level) { 3881 int core_level = 0; 3882 3883 for (int i = 0; i < nprocs; i++) { 3884 for (int j = bottom_level; j > 0; j--) { 3885 if (address2os[i].first.labels[j] > 0) { 3886 if (core_level < (j - 1)) { 3887 core_level = j - 1; 3888 } 3889 } 3890 } 3891 } 3892 return core_level; 3893 } 3894 3895 // This function counts the number of clusters/cores at the given level. 3896 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3897 int nprocs, int bottom_level, 3898 int core_level) { 3899 int ncores = 0; 3900 int i, j; 3901 3902 j = bottom_level; 3903 for (i = 0; i < nprocs; i++) { 3904 for (j = bottom_level; j > core_level; j--) { 3905 if ((i + 1) < nprocs) { 3906 if (address2os[i + 1].first.labels[j] > 0) { 3907 break; 3908 } 3909 } 3910 } 3911 if (j == core_level) { 3912 ncores++; 3913 } 3914 } 3915 if (j > core_level) { 3916 // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss one 3917 // core. May occur when called from __kmp_affinity_find_core(). 3918 ncores++; 3919 } 3920 return ncores; 3921 } 3922 3923 // This function finds the cluster/core to which the given processing unit is bound. 3924 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, 3925 int bottom_level, int core_level) { 3926 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, 3927 core_level) - 3928 1; 3929 } 3930 3931 // This function finds the maximal number of processing units bound to a 3932 // cluster/core at the given level.
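// Illustrative example (added commentary, not from the original sources): on
// a machine enumerated as 2 packages x 2 cores x 2 HW threads (depth 3,
// bottom_level 2), the thread-level label is nonzero for the second HW thread
// of each core, so __kmp_affinity_find_core_level() returns 1 (the core
// level), __kmp_affinity_compute_ncores() then counts 4 cores, and the helper
// below reports a maximum of 2 processing units per core.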
3933 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 3934 int nprocs, int bottom_level, 3935 int core_level) { 3936 int maxprocpercore = 0; 3937 3938 if (core_level < bottom_level) { 3939 for (int i = 0; i < nprocs; i++) { 3940 int percore = address2os[i].first.labels[core_level + 1] + 1; 3941 3942 if (percore > maxprocpercore) { 3943 maxprocpercore = percore; 3944 } 3945 } 3946 } else { 3947 maxprocpercore = 1; 3948 } 3949 return maxprocpercore; 3950 } 3951 3952 static AddrUnsPair *address2os = NULL; 3953 static int *procarr = NULL; 3954 static int __kmp_aff_depth = 0; 3955 3956 #define KMP_EXIT_AFF_NONE \ 3957 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 3958 KMP_ASSERT(address2os == NULL); \ 3959 __kmp_apply_thread_places(NULL, 0); \ 3960 return; 3961 3962 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 3963 const Address *aa = &(((const AddrUnsPair *)a)->first); 3964 const Address *bb = &(((const AddrUnsPair *)b)->first); 3965 unsigned depth = aa->depth; 3966 unsigned i; 3967 KMP_DEBUG_ASSERT(depth == bb->depth); 3968 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 3969 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 3970 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 3971 int j = depth - i - 1; 3972 if (aa->childNums[j] < bb->childNums[j]) 3973 return -1; 3974 if (aa->childNums[j] > bb->childNums[j]) 3975 return 1; 3976 } 3977 for (; i < depth; i++) { 3978 int j = i - __kmp_affinity_compact; 3979 if (aa->childNums[j] < bb->childNums[j]) 3980 return -1; 3981 if (aa->childNums[j] > bb->childNums[j]) 3982 return 1; 3983 } 3984 return 0; 3985 } 3986 3987 static void __kmp_aux_affinity_initialize(void) { 3988 if (__kmp_affinity_masks != NULL) { 3989 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3990 return; 3991 } 3992 3993 // Create the "full" mask - this defines all of the processors that we 3994 // consider to be in the machine model. If respect is set, then it is the 3995 // initialization thread's affinity mask. Otherwise, it is all processors that 3996 // we know about on the machine. 3997 if (__kmp_affin_fullMask == NULL) { 3998 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3999 } 4000 if (KMP_AFFINITY_CAPABLE()) { 4001 if (__kmp_affinity_respect_mask) { 4002 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 4003 4004 // Count the number of available processors. 
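// Illustrative example (assumed scenario): if the process was launched with
// "taskset -c 0-3" on a 16-CPU machine and the "respect" modifier of
// KMP_AFFINITY is in effect (the usual default), the loop below counts 4
// available processors; with the "norespect" modifier the entire machine mask
// is used instead and __kmp_avail_proc becomes __kmp_xproc (16).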
4005 unsigned i; 4006 __kmp_avail_proc = 0; 4007 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 4008 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 4009 continue; 4010 } 4011 __kmp_avail_proc++; 4012 } 4013 if (__kmp_avail_proc > __kmp_xproc) { 4014 if (__kmp_affinity_verbose || 4015 (__kmp_affinity_warnings && 4016 (__kmp_affinity_type != affinity_none))) { 4017 KMP_WARNING(ErrorInitializeAffinity); 4018 } 4019 __kmp_affinity_type = affinity_none; 4020 KMP_AFFINITY_DISABLE(); 4021 return; 4022 } 4023 } else { 4024 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 4025 __kmp_avail_proc = __kmp_xproc; 4026 } 4027 } 4028 4029 if (__kmp_affinity_gran == affinity_gran_tile && 4030 // check if user's request is valid 4031 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) { 4032 KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY"); 4033 __kmp_affinity_gran = affinity_gran_package; 4034 } 4035 4036 int depth = -1; 4037 kmp_i18n_id_t msg_id = kmp_i18n_null; 4038 4039 // For backward compatibility, setting KMP_CPUINFO_FILE => 4040 // KMP_TOPOLOGY_METHOD=cpuinfo 4041 if ((__kmp_cpuinfo_file != NULL) && 4042 (__kmp_affinity_top_method == affinity_top_method_all)) { 4043 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 4044 } 4045 4046 if (__kmp_affinity_top_method == affinity_top_method_all) { 4047 // In the default code path, errors are not fatal - we just try using 4048 // another method. We only emit a warning message if affinity is on, or the 4049 // verbose flag is set, and the nowarnings flag was not set. 4050 const char *file_name = NULL; 4051 int line = 0; 4052 #if KMP_USE_HWLOC 4053 if (depth < 0 && 4054 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4055 if (__kmp_affinity_verbose) { 4056 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4057 } 4058 if (!__kmp_hwloc_error) { 4059 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4060 if (depth == 0) { 4061 KMP_EXIT_AFF_NONE; 4062 } else if (depth < 0 && __kmp_affinity_verbose) { 4063 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4064 } 4065 } else if (__kmp_affinity_verbose) { 4066 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4067 } 4068 } 4069 #endif 4070 4071 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4072 4073 if (depth < 0) { 4074 if (__kmp_affinity_verbose) { 4075 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4076 } 4077 4078 file_name = NULL; 4079 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4080 if (depth == 0) { 4081 KMP_EXIT_AFF_NONE; 4082 } 4083 4084 if (depth < 0) { 4085 if (__kmp_affinity_verbose) { 4086 if (msg_id != kmp_i18n_null) { 4087 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", 4088 __kmp_i18n_catgets(msg_id), 4089 KMP_I18N_STR(DecodingLegacyAPIC)); 4090 } else { 4091 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4092 KMP_I18N_STR(DecodingLegacyAPIC)); 4093 } 4094 } 4095 4096 file_name = NULL; 4097 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4098 if (depth == 0) { 4099 KMP_EXIT_AFF_NONE; 4100 } 4101 } 4102 } 4103 4104 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4105 4106 #if KMP_OS_LINUX 4107 4108 if (depth < 0) { 4109 if (__kmp_affinity_verbose) { 4110 if (msg_id != kmp_i18n_null) { 4111 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", 4112 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4113 } else { 4114 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4115 } 4116 } 4117 4118 FILE *f = fopen("/proc/cpuinfo", "r"); 4119 if (f == NULL) { 4120 msg_id = kmp_i18n_str_CantOpenCpuinfo; 4121 } else { 4122 file_name =
"/proc/cpuinfo"; 4123 depth = 4124 __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4125 fclose(f); 4126 if (depth == 0) { 4127 KMP_EXIT_AFF_NONE; 4128 } 4129 } 4130 } 4131 4132 #endif /* KMP_OS_LINUX */ 4133 4134 #if KMP_GROUP_AFFINITY 4135 4136 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4137 if (__kmp_affinity_verbose) { 4138 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4139 } 4140 4141 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4142 KMP_ASSERT(depth != 0); 4143 } 4144 4145 #endif /* KMP_GROUP_AFFINITY */ 4146 4147 if (depth < 0) { 4148 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4149 if (file_name == NULL) { 4150 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4151 } else if (line == 0) { 4152 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4153 } else { 4154 KMP_INFORM(UsingFlatOSFileLine, file_name, line, 4155 __kmp_i18n_catgets(msg_id)); 4156 } 4157 } 4158 // FIXME - print msg if msg_id = kmp_i18n_null ??? 4159 4160 file_name = ""; 4161 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4162 if (depth == 0) { 4163 KMP_EXIT_AFF_NONE; 4164 } 4165 KMP_ASSERT(depth > 0); 4166 KMP_ASSERT(address2os != NULL); 4167 } 4168 } 4169 4170 #if KMP_USE_HWLOC 4171 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4172 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4173 if (__kmp_affinity_verbose) { 4174 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4175 } 4176 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4177 if (depth == 0) { 4178 KMP_EXIT_AFF_NONE; 4179 } 4180 } 4181 #endif // KMP_USE_HWLOC 4182 4183 // If the user has specified that a paricular topology discovery method is to be 4184 // used, then we abort if that method fails. The exception is group affinity, 4185 // which might have been implicitly set. 
4186 4187 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4188 4189 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 4190 if (__kmp_affinity_verbose) { 4191 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4192 } 4193 4194 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4195 if (depth == 0) { 4196 KMP_EXIT_AFF_NONE; 4197 } 4198 if (depth < 0) { 4199 KMP_ASSERT(msg_id != kmp_i18n_null); 4200 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4201 } 4202 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4203 if (__kmp_affinity_verbose) { 4204 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4205 } 4206 4207 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4208 if (depth == 0) { 4209 KMP_EXIT_AFF_NONE; 4210 } 4211 if (depth < 0) { 4212 KMP_ASSERT(msg_id != kmp_i18n_null); 4213 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4214 } 4215 } 4216 4217 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4218 4219 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4220 const char *filename; 4221 if (__kmp_cpuinfo_file != NULL) { 4222 filename = __kmp_cpuinfo_file; 4223 } else { 4224 filename = "/proc/cpuinfo"; 4225 } 4226 4227 if (__kmp_affinity_verbose) { 4228 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4229 } 4230 4231 FILE *f = fopen(filename, "r"); 4232 if (f == NULL) { 4233 int code = errno; 4234 if (__kmp_cpuinfo_file != NULL) { 4235 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), 4236 KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null); 4237 } else { 4238 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code), 4239 __kmp_msg_null); 4240 } 4241 } 4242 int line = 0; 4243 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4244 fclose(f); 4245 if (depth < 0) { 4246 KMP_ASSERT(msg_id != kmp_i18n_null); 4247 if (line > 0) { 4248 KMP_FATAL(FileLineMsgExiting, filename, line, 4249 __kmp_i18n_catgets(msg_id)); 4250 } else { 4251 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4252 } 4253 } 4254 if (__kmp_affinity_type == affinity_none) { 4255 KMP_ASSERT(depth == 0); 4256 KMP_EXIT_AFF_NONE; 4257 } 4258 } 4259 4260 #if KMP_GROUP_AFFINITY 4261 4262 else if (__kmp_affinity_top_method == affinity_top_method_group) { 4263 if (__kmp_affinity_verbose) { 4264 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4265 } 4266 4267 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4268 KMP_ASSERT(depth != 0); 4269 if (depth < 0) { 4270 KMP_ASSERT(msg_id != kmp_i18n_null); 4271 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4272 } 4273 } 4274 4275 #endif /* KMP_GROUP_AFFINITY */ 4276 4277 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4278 if (__kmp_affinity_verbose) { 4279 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4280 } 4281 4282 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4283 if (depth == 0) { 4284 KMP_EXIT_AFF_NONE; 4285 } 4286 // should not fail 4287 KMP_ASSERT(depth > 0); 4288 KMP_ASSERT(address2os != NULL); 4289 } 4290 4291 if (address2os == NULL) { 4292 if (KMP_AFFINITY_CAPABLE() && 4293 (__kmp_affinity_verbose || 4294 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4295 KMP_WARNING(ErrorInitializeAffinity); 4296 } 4297 __kmp_affinity_type = affinity_none; 4298 KMP_AFFINITY_DISABLE(); 4299 return; 4300 } 4301 4302 if (__kmp_affinity_gran == affinity_gran_tile 4303 #if KMP_USE_HWLOC 4304 && __kmp_tile_depth == 0 
4305 #endif 4306 ) { 4307 // tiles requested but not detected, warn user on this 4308 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY"); 4309 } 4310 4311 __kmp_apply_thread_places(&address2os, depth); 4312 4313 // Create the table of masks, indexed by thread Id. 4314 unsigned maxIndex; 4315 unsigned numUnique; 4316 kmp_affin_mask_t *osId2Mask = 4317 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4318 if (__kmp_affinity_gran_levels == 0) { 4319 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4320 } 4321 4322 // Set the childNums vector in all Address objects. This must be done before 4323 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4324 // account the setting of __kmp_affinity_compact. 4325 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4326 4327 switch (__kmp_affinity_type) { 4328 4329 case affinity_explicit: 4330 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4331 #if OMP_40_ENABLED 4332 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4333 #endif 4334 { 4335 __kmp_affinity_process_proclist( 4336 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4337 __kmp_affinity_proclist, osId2Mask, maxIndex); 4338 } 4339 #if OMP_40_ENABLED 4340 else { 4341 __kmp_affinity_process_placelist( 4342 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4343 __kmp_affinity_proclist, osId2Mask, maxIndex); 4344 } 4345 #endif 4346 if (__kmp_affinity_num_masks == 0) { 4347 if (__kmp_affinity_verbose || 4348 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4349 KMP_WARNING(AffNoValidProcID); 4350 } 4351 __kmp_affinity_type = affinity_none; 4352 return; 4353 } 4354 break; 4355 4356 // The other affinity types rely on sorting the Addresses according to some 4357 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4358 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4359 // to do the sort and create the array of affinity masks. 
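// Illustrative example (added commentary): on a 2-package x 2-core x 2-thread
// machine (depth 3), affinity_compact with the default permute value of 0
// sorts by package, then core, then HW thread, so consecutive OpenMP threads
// share a core; affinity_scatter inverts the permutation (depth - 1 - compact)
// so the sort is keyed by HW thread, then core, then package, and consecutive
// threads land on different packages first.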
4360 4361 case affinity_logical: 4362 __kmp_affinity_compact = 0; 4363 if (__kmp_affinity_offset) { 4364 __kmp_affinity_offset = 4365 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4366 } 4367 goto sortAddresses; 4368 4369 case affinity_physical: 4370 if (__kmp_nThreadsPerCore > 1) { 4371 __kmp_affinity_compact = 1; 4372 if (__kmp_affinity_compact >= depth) { 4373 __kmp_affinity_compact = 0; 4374 } 4375 } else { 4376 __kmp_affinity_compact = 0; 4377 } 4378 if (__kmp_affinity_offset) { 4379 __kmp_affinity_offset = 4380 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4381 } 4382 goto sortAddresses; 4383 4384 case affinity_scatter: 4385 if (__kmp_affinity_compact >= depth) { 4386 __kmp_affinity_compact = 0; 4387 } else { 4388 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4389 } 4390 goto sortAddresses; 4391 4392 case affinity_compact: 4393 if (__kmp_affinity_compact >= depth) { 4394 __kmp_affinity_compact = depth - 1; 4395 } 4396 goto sortAddresses; 4397 4398 case affinity_balanced: 4399 if (depth <= 1) { 4400 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4401 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4402 } 4403 __kmp_affinity_type = affinity_none; 4404 return; 4405 } else if (__kmp_affinity_uniform_topology()) { 4406 break; 4407 } else { // Non-uniform topology 4408 4409 // Save the depth for further usage 4410 __kmp_aff_depth = depth; 4411 4412 int core_level = __kmp_affinity_find_core_level( 4413 address2os, __kmp_avail_proc, depth - 1); 4414 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4415 depth - 1, core_level); 4416 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4417 address2os, __kmp_avail_proc, depth - 1, core_level); 4418 4419 int nproc = ncores * maxprocpercore; 4420 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4421 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4422 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4423 } 4424 __kmp_affinity_type = affinity_none; 4425 return; 4426 } 4427 4428 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4429 for (int i = 0; i < nproc; i++) { 4430 procarr[i] = -1; 4431 } 4432 4433 int lastcore = -1; 4434 int inlastcore = 0; 4435 for (int i = 0; i < __kmp_avail_proc; i++) { 4436 int proc = address2os[i].second; 4437 int core = 4438 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4439 4440 if (core == lastcore) { 4441 inlastcore++; 4442 } else { 4443 inlastcore = 0; 4444 } 4445 lastcore = core; 4446 4447 procarr[core * maxprocpercore + inlastcore] = proc; 4448 } 4449 4450 break; 4451 } 4452 4453 sortAddresses: 4454 // Allocate the gtid->affinity mask table. 4455 if (__kmp_affinity_dups) { 4456 __kmp_affinity_num_masks = __kmp_avail_proc; 4457 } else { 4458 __kmp_affinity_num_masks = numUnique; 4459 } 4460 4461 #if OMP_40_ENABLED 4462 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4463 (__kmp_affinity_num_places > 0) && 4464 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4465 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4466 } 4467 #endif 4468 4469 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4470 4471 // Sort the address2os table according to the current setting of 4472 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
4473 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4474 __kmp_affinity_cmp_Address_child_num); 4475 { 4476 int i; 4477 unsigned j; 4478 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4479 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) { 4480 continue; 4481 } 4482 unsigned osId = address2os[i].second; 4483 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4484 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4485 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4486 KMP_CPU_COPY(dest, src); 4487 if (++j >= __kmp_affinity_num_masks) { 4488 break; 4489 } 4490 } 4491 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4492 } 4493 break; 4494 4495 default: 4496 KMP_ASSERT2(0, "Unexpected affinity setting"); 4497 } 4498 4499 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4500 machine_hierarchy.init(address2os, __kmp_avail_proc); 4501 } 4502 #undef KMP_EXIT_AFF_NONE 4503 4504 void __kmp_affinity_initialize(void) { 4505 // Much of the code above was written assuming that if a machine was not 4506 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4507 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4508 // There are too many checks for __kmp_affinity_type == affinity_none 4509 // in this code. Instead of trying to change them all, check if 4510 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4511 // affinity_none, call the real initialization routine, then restore 4512 // __kmp_affinity_type to affinity_disabled. 4513 int disabled = (__kmp_affinity_type == affinity_disabled); 4514 if (!KMP_AFFINITY_CAPABLE()) { 4515 KMP_ASSERT(disabled); 4516 } 4517 if (disabled) { 4518 __kmp_affinity_type = affinity_none; 4519 } 4520 __kmp_aux_affinity_initialize(); 4521 if (disabled) { 4522 __kmp_affinity_type = affinity_disabled; 4523 } 4524 } 4525 4526 void __kmp_affinity_uninitialize(void) { 4527 if (__kmp_affinity_masks != NULL) { 4528 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4529 __kmp_affinity_masks = NULL; 4530 } 4531 if (__kmp_affin_fullMask != NULL) { 4532 KMP_CPU_FREE(__kmp_affin_fullMask); 4533 __kmp_affin_fullMask = NULL; 4534 } 4535 __kmp_affinity_num_masks = 0; 4536 __kmp_affinity_type = affinity_default; 4537 #if OMP_40_ENABLED 4538 __kmp_affinity_num_places = 0; 4539 #endif 4540 if (__kmp_affinity_proclist != NULL) { 4541 __kmp_free(__kmp_affinity_proclist); 4542 __kmp_affinity_proclist = NULL; 4543 } 4544 if (address2os != NULL) { 4545 __kmp_free(address2os); 4546 address2os = NULL; 4547 } 4548 if (procarr != NULL) { 4549 __kmp_free(procarr); 4550 procarr = NULL; 4551 } 4552 #if KMP_USE_HWLOC 4553 if (__kmp_hwloc_topology != NULL) { 4554 hwloc_topology_destroy(__kmp_hwloc_topology); 4555 __kmp_hwloc_topology = NULL; 4556 } 4557 #endif 4558 KMPAffinity::destroy_api(); 4559 } 4560 4561 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4562 if (!KMP_AFFINITY_CAPABLE()) { 4563 return; 4564 } 4565 4566 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4567 if (th->th.th_affin_mask == NULL) { 4568 KMP_CPU_ALLOC(th->th.th_affin_mask); 4569 } else { 4570 KMP_CPU_ZERO(th->th.th_affin_mask); 4571 } 4572 4573 // Copy the thread mask to the kmp_info_t structure. If 4574 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that 4575 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4576 // then the full mask is the same as the mask of the initialization thread.
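// Otherwise a specific place is picked round-robin below as
// i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks. Illustrative
// example (added commentary): with 4 affinity masks and a zero offset, worker
// gtids 0..7 are bound to places 0, 1, 2, 3, 0, 1, 2, 3.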
4577 kmp_affin_mask_t *mask; 4578 int i; 4579 4580 #if OMP_40_ENABLED 4581 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4582 #endif 4583 { 4584 if ((__kmp_affinity_type == affinity_none) || 4585 (__kmp_affinity_type == affinity_balanced)) { 4586 #if KMP_GROUP_AFFINITY 4587 if (__kmp_num_proc_groups > 1) { 4588 return; 4589 } 4590 #endif 4591 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4592 i = KMP_PLACE_ALL; 4593 mask = __kmp_affin_fullMask; 4594 } else { 4595 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4596 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4597 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4598 } 4599 } 4600 #if OMP_40_ENABLED 4601 else { 4602 if ((!isa_root) || 4603 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4604 #if KMP_GROUP_AFFINITY 4605 if (__kmp_num_proc_groups > 1) { 4606 return; 4607 } 4608 #endif 4609 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4610 i = KMP_PLACE_ALL; 4611 mask = __kmp_affin_fullMask; 4612 } else { 4613 // int i = some hash function or just a counter that doesn't 4614 // always start at 0. Use gtid for now. 4615 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4616 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4617 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4618 } 4619 } 4620 #endif 4621 4622 #if OMP_40_ENABLED 4623 th->th.th_current_place = i; 4624 if (isa_root) { 4625 th->th.th_new_place = i; 4626 th->th.th_first_place = 0; 4627 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4628 } 4629 4630 if (i == KMP_PLACE_ALL) { 4631 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4632 gtid)); 4633 } else { 4634 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4635 gtid, i)); 4636 } 4637 #else 4638 if (i == -1) { 4639 KA_TRACE( 4640 100, 4641 ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4642 gtid)); 4643 } else { 4644 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4645 gtid, i)); 4646 } 4647 #endif /* OMP_40_ENABLED */ 4648 4649 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4650 4651 if (__kmp_affinity_verbose 4652 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4653 && (__kmp_affinity_type == affinity_none || i != KMP_PLACE_ALL)) { 4654 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4655 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4656 th->th.th_affin_mask); 4657 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4658 __kmp_gettid(), gtid, buf); 4659 } 4660 4661 #if KMP_OS_WINDOWS 4662 // On Windows* OS, the process affinity mask might have changed. If the user 4663 // didn't request affinity and this call fails, just continue silently. 4664 // See CQ171393. 4665 if (__kmp_affinity_type == affinity_none) { 4666 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4667 } else 4668 #endif 4669 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4670 } 4671 4672 #if OMP_40_ENABLED 4673 4674 void __kmp_affinity_set_place(int gtid) { 4675 int retval; 4676 4677 if (!KMP_AFFINITY_CAPABLE()) { 4678 return; 4679 } 4680 4681 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4682 4683 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4684 "place = %d)\n", 4685 gtid, th->th.th_new_place, th->th.th_current_place)); 4686 4687 // Check that the new place is within this thread's partition. 
4688 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4689 KMP_ASSERT(th->th.th_new_place >= 0); 4690 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4691 if (th->th.th_first_place <= th->th.th_last_place) { 4692 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 4693 (th->th.th_new_place <= th->th.th_last_place)); 4694 } else { 4695 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 4696 (th->th.th_new_place >= th->th.th_last_place)); 4697 } 4698 4699 // Copy the thread mask to the kmp_info_t structure, 4700 // and set this thread's affinity. 4701 kmp_affin_mask_t *mask = 4702 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); 4703 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4704 th->th.th_current_place = th->th.th_new_place; 4705 4706 if (__kmp_affinity_verbose) { 4707 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4708 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4709 th->th.th_affin_mask); 4710 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4711 __kmp_gettid(), gtid, buf); 4712 } 4713 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4714 } 4715 4716 #endif /* OMP_40_ENABLED */ 4717 4718 int __kmp_aux_set_affinity(void **mask) { 4719 int gtid; 4720 kmp_info_t *th; 4721 int retval; 4722 4723 if (!KMP_AFFINITY_CAPABLE()) { 4724 return -1; 4725 } 4726 4727 gtid = __kmp_entry_gtid(); 4728 KA_TRACE(1000, ; { 4729 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4730 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4731 (kmp_affin_mask_t *)(*mask)); 4732 __kmp_debug_printf( 4733 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid, 4734 buf); 4735 }); 4736 4737 if (__kmp_env_consistency_check) { 4738 if ((mask == NULL) || (*mask == NULL)) { 4739 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4740 } else { 4741 unsigned proc; 4742 int num_procs = 0; 4743 4744 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 4745 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4746 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4747 } 4748 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4749 continue; 4750 } 4751 num_procs++; 4752 } 4753 if (num_procs == 0) { 4754 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4755 } 4756 4757 #if KMP_GROUP_AFFINITY 4758 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 4759 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4760 } 4761 #endif /* KMP_GROUP_AFFINITY */ 4762 } 4763 } 4764 4765 th = __kmp_threads[gtid]; 4766 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4767 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4768 if (retval == 0) { 4769 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 4770 } 4771 4772 #if OMP_40_ENABLED 4773 th->th.th_current_place = KMP_PLACE_UNDEFINED; 4774 th->th.th_new_place = KMP_PLACE_UNDEFINED; 4775 th->th.th_first_place = 0; 4776 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4777 4778 // Turn off 4.0 affinity for the current thread at this parallel level.
4779 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4780 #endif 4781 4782 return retval; 4783 } 4784 4785 int __kmp_aux_get_affinity(void **mask) { 4786 int gtid; 4787 int retval; 4788 kmp_info_t *th; 4789 4790 if (!KMP_AFFINITY_CAPABLE()) { 4791 return -1; 4792 } 4793 4794 gtid = __kmp_entry_gtid(); 4795 th = __kmp_threads[gtid]; 4796 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4797 4798 KA_TRACE(1000, ; { 4799 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4800 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4801 th->th.th_affin_mask); 4802 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4803 gtid, buf); 4804 }); 4805 4806 if (__kmp_env_consistency_check) { 4807 if ((mask == NULL) || (*mask == NULL)) { 4808 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4809 } 4810 } 4811 4812 #if !KMP_OS_WINDOWS 4813 4814 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4815 KA_TRACE(1000, ; { 4816 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4817 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4818 (kmp_affin_mask_t *)(*mask)); 4819 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4820 gtid, buf); 4821 }); 4822 return retval; 4823 4824 #else 4825 4826 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4827 return 0; 4828 4829 #endif /* KMP_OS_WINDOWS */ 4830 } 4831 4832 int __kmp_aux_get_affinity_max_proc() { 4833 if (!KMP_AFFINITY_CAPABLE()) { 4834 return 0; 4835 } 4836 #if KMP_GROUP_AFFINITY 4837 if (__kmp_num_proc_groups > 1) { 4838 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4839 } 4840 #endif 4841 return __kmp_xproc; 4842 } 4843 4844 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4845 int retval; 4846 4847 if (!KMP_AFFINITY_CAPABLE()) { 4848 return -1; 4849 } 4850 4851 KA_TRACE(1000, ; { 4852 int gtid = __kmp_entry_gtid(); 4853 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4854 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4855 (kmp_affin_mask_t *)(*mask)); 4856 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4857 "affinity mask for thread %d = %s\n", 4858 proc, gtid, buf); 4859 }); 4860 4861 if (__kmp_env_consistency_check) { 4862 if ((mask == NULL) || (*mask == NULL)) { 4863 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4864 } 4865 } 4866 4867 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4868 return -1; 4869 } 4870 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4871 return -2; 4872 } 4873 4874 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4875 return 0; 4876 } 4877 4878 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4879 int retval; 4880 4881 if (!KMP_AFFINITY_CAPABLE()) { 4882 return -1; 4883 } 4884 4885 KA_TRACE(1000, ; { 4886 int gtid = __kmp_entry_gtid(); 4887 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4888 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4889 (kmp_affin_mask_t *)(*mask)); 4890 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4891 "affinity mask for thread %d = %s\n", 4892 proc, gtid, buf); 4893 }); 4894 4895 if (__kmp_env_consistency_check) { 4896 if ((mask == NULL) || (*mask == NULL)) { 4897 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4898 } 4899 } 4900 4901 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4902 return -1; 4903 } 4904 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4905 return -2; 4906 } 4907 4908 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4909 return 0; 4910 } 
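// The __kmp_aux_*_affinity_mask_proc() helpers above and below back the public
// kmp_*_affinity_mask_proc() entry points of the KMP affinity API. A minimal
// usage sketch (assuming the usual omp.h declarations of that API; illustrative
// only):
//
//   kmp_affinity_mask_t m;
//   kmp_create_affinity_mask(&m);
//   if (kmp_set_affinity_mask_proc(2, &m) != 0) {
//     /* proc 2 is invalid or outside the full mask (see checks above) */
//   }
//   if (kmp_set_affinity(&m) != 0) { /* binding failed */ }
//   kmp_destroy_affinity_mask(&m);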
4911 4912 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4913 int retval; 4914 4915 if (!KMP_AFFINITY_CAPABLE()) { 4916 return -1; 4917 } 4918 4919 KA_TRACE(1000, ; { 4920 int gtid = __kmp_entry_gtid(); 4921 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4922 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4923 (kmp_affin_mask_t *)(*mask)); 4924 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4925 "affinity mask for thread %d = %s\n", 4926 proc, gtid, buf); 4927 }); 4928 4929 if (__kmp_env_consistency_check) { 4930 if ((mask == NULL) || (*mask == NULL)) { 4931 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4932 } 4933 } 4934 4935 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4936 return -1; 4937 } 4938 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4939 return 0; 4940 } 4941 4942 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4943 } 4944 4945 // Dynamic affinity settings - Affinity balanced 4946 void __kmp_balanced_affinity(int tid, int nthreads) { 4947 bool fine_gran = true; 4948 4949 switch (__kmp_affinity_gran) { 4950 case affinity_gran_fine: 4951 case affinity_gran_thread: 4952 break; 4953 case affinity_gran_core: 4954 if (__kmp_nThreadsPerCore > 1) { 4955 fine_gran = false; 4956 } 4957 break; 4958 case affinity_gran_package: 4959 if (nCoresPerPkg > 1) { 4960 fine_gran = false; 4961 } 4962 break; 4963 default: 4964 fine_gran = false; 4965 } 4966 4967 if (__kmp_affinity_uniform_topology()) { 4968 int coreID; 4969 int threadID; 4970 // Number of hyper threads per core in HT machine 4971 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4972 // Number of cores 4973 int ncores = __kmp_ncores; 4974 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4975 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4976 ncores = nPackages; 4977 } 4978 // How many threads will be bound to each core 4979 int chunk = nthreads / ncores; 4980 // How many cores will have an additional thread bound to it - "big cores" 4981 int big_cores = nthreads % ncores; 4982 // Number of threads on the big cores 4983 int big_nth = (chunk + 1) * big_cores; 4984 if (tid < big_nth) { 4985 coreID = tid / (chunk + 1); 4986 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4987 } else { // tid >= big_nth 4988 coreID = (tid - big_cores) / chunk; 4989 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4990 } 4991 4992 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4993 "Illegal set affinity operation when not capable"); 4994 4995 kmp_affin_mask_t *mask; 4996 KMP_CPU_ALLOC_ON_STACK(mask); 4997 KMP_CPU_ZERO(mask); 4998 4999 if (fine_gran) { 5000 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 5001 KMP_CPU_SET(osID, mask); 5002 } else { 5003 for (int i = 0; i < __kmp_nth_per_core; i++) { 5004 int osID; 5005 osID = address2os[coreID * __kmp_nth_per_core + i].second; 5006 KMP_CPU_SET(osID, mask); 5007 } 5008 } 5009 if (__kmp_affinity_verbose) { 5010 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5011 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5012 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5013 __kmp_gettid(), tid, buf); 5014 } 5015 __kmp_set_system_affinity(mask, TRUE); 5016 KMP_CPU_FREE_FROM_STACK(mask); 5017 } else { // Non-uniform topology 5018 5019 kmp_affin_mask_t *mask; 5020 KMP_CPU_ALLOC_ON_STACK(mask); 5021 KMP_CPU_ZERO(mask); 5022 5023 int core_level = __kmp_affinity_find_core_level( 5024 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 5025 int ncores = 
__kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 5026 __kmp_aff_depth - 1, core_level); 5027 int nth_per_core = __kmp_affinity_max_proc_per_core( 5028 address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 5029 5030 // For performance gain consider the special case nthreads == 5031 // __kmp_avail_proc 5032 if (nthreads == __kmp_avail_proc) { 5033 if (fine_gran) { 5034 int osID = address2os[tid].second; 5035 KMP_CPU_SET(osID, mask); 5036 } else { 5037 int core = __kmp_affinity_find_core(address2os, tid, 5038 __kmp_aff_depth - 1, core_level); 5039 for (int i = 0; i < __kmp_avail_proc; i++) { 5040 int osID = address2os[i].second; 5041 if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, 5042 core_level) == core) { 5043 KMP_CPU_SET(osID, mask); 5044 } 5045 } 5046 } 5047 } else if (nthreads <= ncores) { 5048 5049 int core = 0; 5050 for (int i = 0; i < ncores; i++) { 5051 // Check if this core from procarr[] is in the mask 5052 int in_mask = 0; 5053 for (int j = 0; j < nth_per_core; j++) { 5054 if (procarr[i * nth_per_core + j] != -1) { 5055 in_mask = 1; 5056 break; 5057 } 5058 } 5059 if (in_mask) { 5060 if (tid == core) { 5061 for (int j = 0; j < nth_per_core; j++) { 5062 int osID = procarr[i * nth_per_core + j]; 5063 if (osID != -1) { 5064 KMP_CPU_SET(osID, mask); 5065 // For fine granularity it is enough to set the first available 5066 // osID for this core 5067 if (fine_gran) { 5068 break; 5069 } 5070 } 5071 } 5072 break; 5073 } else { 5074 core++; 5075 } 5076 } 5077 } 5078 } else { // nthreads > ncores 5079 // Array to save the number of processors at each core 5080 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 5081 // Array to save the number of cores with "x" available processors; 5082 int *ncores_with_x_procs = 5083 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 5084 // Array to save the number of cores with # procs from x to nth_per_core 5085 int *ncores_with_x_to_max_procs = 5086 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 5087 5088 for (int i = 0; i <= nth_per_core; i++) { 5089 ncores_with_x_procs[i] = 0; 5090 ncores_with_x_to_max_procs[i] = 0; 5091 } 5092 5093 for (int i = 0; i < ncores; i++) { 5094 int cnt = 0; 5095 for (int j = 0; j < nth_per_core; j++) { 5096 if (procarr[i * nth_per_core + j] != -1) { 5097 cnt++; 5098 } 5099 } 5100 nproc_at_core[i] = cnt; 5101 ncores_with_x_procs[cnt]++; 5102 } 5103 5104 for (int i = 0; i <= nth_per_core; i++) { 5105 for (int j = i; j <= nth_per_core; j++) { 5106 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 5107 } 5108 } 5109 5110 // Max number of processors 5111 int nproc = nth_per_core * ncores; 5112 // An array to keep number of threads per each context 5113 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 5114 for (int i = 0; i < nproc; i++) { 5115 newarr[i] = 0; 5116 } 5117 5118 int nth = nthreads; 5119 int flag = 0; 5120 while (nth > 0) { 5121 for (int j = 1; j <= nth_per_core; j++) { 5122 int cnt = ncores_with_x_to_max_procs[j]; 5123 for (int i = 0; i < ncores; i++) { 5124 // Skip the core with 0 processors 5125 if (nproc_at_core[i] == 0) { 5126 continue; 5127 } 5128 for (int k = 0; k < nth_per_core; k++) { 5129 if (procarr[i * nth_per_core + k] != -1) { 5130 if (newarr[i * nth_per_core + k] == 0) { 5131 newarr[i * nth_per_core + k] = 1; 5132 cnt--; 5133 nth--; 5134 break; 5135 } else { 5136 if (flag != 0) { 5137 newarr[i * nth_per_core + k]++; 5138 cnt--; 5139 nth--; 5140 break; 5141 } 5142 } 5143 } 5144 } 5145 if (cnt == 0 || nth == 0) { 5146 break; 5147 } 
5148 } 5149 if (nth == 0) { 5150 break; 5151 } 5152 } 5153 flag = 1; 5154 } 5155 int sum = 0; 5156 for (int i = 0; i < nproc; i++) { 5157 sum += newarr[i]; 5158 if (sum > tid) { 5159 if (fine_gran) { 5160 int osID = procarr[i]; 5161 KMP_CPU_SET(osID, mask); 5162 } else { 5163 int coreID = i / nth_per_core; 5164 for (int ii = 0; ii < nth_per_core; ii++) { 5165 int osID = procarr[coreID * nth_per_core + ii]; 5166 if (osID != -1) { 5167 KMP_CPU_SET(osID, mask); 5168 } 5169 } 5170 } 5171 break; 5172 } 5173 } 5174 __kmp_free(newarr); 5175 } 5176 5177 if (__kmp_affinity_verbose) { 5178 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5179 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5180 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5181 __kmp_gettid(), tid, buf); 5182 } 5183 __kmp_set_system_affinity(mask, TRUE); 5184 KMP_CPU_FREE_FROM_STACK(mask); 5185 } 5186 } 5187 5188 #if KMP_OS_LINUX 5189 // We don't need this entry for Windows because 5190 // there is GetProcessAffinityMask() api 5191 // 5192 // The intended usage is indicated by these steps: 5193 // 1) The user gets the current affinity mask 5194 // 2) Then sets the affinity by calling this function 5195 // 3) Error check the return value 5196 // 4) Use non-OpenMP parallelization 5197 // 5) Reset the affinity to what was stored in step 1) 5198 #ifdef __cplusplus 5199 extern "C" 5200 #endif 5201 int 5202 kmp_set_thread_affinity_mask_initial() 5203 // the function returns 0 on success, 5204 // -1 if we cannot bind thread 5205 // >0 (errno) if an error happened during binding 5206 { 5207 int gtid = __kmp_get_gtid(); 5208 if (gtid < 0) { 5209 // Do not touch non-omp threads 5210 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5211 "non-omp thread, returning\n")); 5212 return -1; 5213 } 5214 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5215 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5216 "affinity not initialized, returning\n")); 5217 return -1; 5218 } 5219 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5220 "set full mask for thread %d\n", 5221 gtid)); 5222 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5223 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5224 } 5225 #endif 5226 5227 #endif // KMP_AFFINITY_SUPPORTED 5228
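// A minimal usage sketch for kmp_set_thread_affinity_mask_initial() above
// (added commentary; assumes Linux and the glibc sched_*affinity interface),
// following the steps listed in its description:
//
//   cpu_set_t saved;
//   CPU_ZERO(&saved);
//   sched_getaffinity(0, sizeof(saved), &saved);  // 1) save the current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0) {
//     // 4) run non-OpenMP parallel work with the widened mask
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);  // 5) restore the saved mask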