/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

// Print the affinity mask to the character array in a pretty format.
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf_len >= 40);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Find first element / check for empty set.
  size_t i;
  i = mask->begin();
  if (i == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    while (*scan != '\0')
      scan++;
    KMP_ASSERT(scan <= end);
    return buf;
  }

  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
  while (*scan != '\0')
    scan++;
  i++;
  for (; i != mask->end(); i = mask->next(i)) {
    if (!KMP_CPU_ISSET(i, mask)) {
      continue;
    }

    // Check for buffer overflow. A string of the form ",<n>" will have at most
    // 10 characters, plus we want to leave room to print ",...}" if the set is
    // too large to print for a total of 15 characters. We already left room for
    // '\0' in setting end.
    if (end - scan < 15) {
      break;
    }
    KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
    while (*scan != '\0')
      scan++;
  }
  if (i != mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, ",...");
    while (*scan != '\0')
      scan++;
  }
  KMP_SNPRINTF(scan, end - scan + 1, "}");
  while (*scan != '\0')
    scan++;
  KMP_ASSERT(scan <= end);
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
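// For an OS proc whose labels are {package, core, thread} = {0, 2, 1}, the
// per-proc line produced below reads roughly "Package 0 Core 2 Thread 1"
// (the exact wording comes from the i18n message catalog); the values here
// are illustrative only.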
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels, so it is removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
                                                  int nActiveThreads, int depth,
                                                  int *pkgLevel, int *coreLevel,
                                                  int *threadLevel) {
  int level;
  int i;
  int radix1_detected;

  for (level = depth - 1; level >= 0; --level) {
    // Always keep the package level
    if (level == *pkgLevel)
      continue;
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nActiveThreads; ++i) {
      if (address2os[0].first.labels[level] !=
          address2os[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    if (level == *threadLevel) {
      // If only one thread per core, then just decrement
      // the depth which removes the threadlevel from address2os
      for (i = 0; i < nActiveThreads; ++i) {
        address2os[i].first.depth--;
      }
      *threadLevel = -1;
    } else if (level == *coreLevel) {
      // For core level, we move the thread labels over if they are still
      // valid (*threadLevel != -1), and also reduce the depth another level
      for (i = 0; i < nActiveThreads; ++i) {
        if (*threadLevel != -1) {
          address2os[i].first.labels[*coreLevel] =
              address2os[i].first.labels[*threadLevel];
        }
        address2os[i].first.depth--;
      }
      *coreLevel = -1;
    }
  }
  return address2os[0].first.depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the package
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
           obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  int depth = 3;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
        HWLOC_OBJ_CORE);
    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
        HWLOC_OBJ_PU);
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.
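  // The nested loops below walk the hwloc tree package -> core -> PU in
  // logical order, keeping only PUs that appear in __kmp_affin_fullMask,
  // so the recomputed globals reflect just the accessible resources.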

  hwloc_obj_t pu;
  hwloc_obj_t core;
  hwloc_obj_t socket;
  int nActiveThreads = 0;
  int socket_identifier = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  for (socket =
           hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
       socket != NULL;
       socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology,
                                           HWLOC_OBJ_PACKAGE, socket),
      socket_identifier++) {
    int core_identifier = 0;
    int num_active_cores = 0;
    for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
                                            socket->logical_index,
                                            HWLOC_OBJ_CORE, 0);
         core != NULL &&
         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
                                        core) == socket;
         core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE,
                                           core),
        core_identifier++) {
      int pu_identifier = 0;
      int num_active_threads = 0;
      for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
                                            core->logical_index, HWLOC_OBJ_PU,
                                            0);
           pu != NULL &&
           hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
                                          pu) == core;
           pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
                                           pu),
          pu_identifier++) {
        Address addr(3);
        if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
          continue; // skip inactive (inaccessible) unit
        KA_TRACE(20,
                 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                  socket->os_index, socket->logical_index, core->os_index,
                  core->logical_index, pu->os_index, pu->logical_index));
        addr.labels[0] = socket_identifier; // package
        addr.labels[1] = core_identifier; // core
        addr.labels[2] = pu_identifier; // pu
        retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
        __kmp_pu_os_idx[nActiveThreads] =
            pu->os_index; // keep os index for each active pu
        nActiveThreads++;
        ++num_active_threads; // count active threads per core
      }
      if (num_active_threads) { // were there any active threads on the core?
        ++__kmp_ncores; // count total active cores
        ++num_active_cores; // count active cores per socket
        if (num_active_threads > __kmp_nThreadsPerCore)
          __kmp_nThreadsPerCore = num_active_threads; // calc maximum
      }
    }
    if (num_active_cores) { // were there any active cores on the socket?
      ++nPackages; // count total active packages
      if (num_active_cores > nCoresPerPkg)
        nCoresPerPkg = num_active_cores; // calc maximum
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  unsigned uniform =
      (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", nPackages);
    // for (level = 1; level <= pkgLevel; level++) {
    //  __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    // }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(
      retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
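    // __kmp_affinity_gran_levels ends up counting the map levels finer than
    // the requested granularity; e.g. with a full package/core/thread map and
    // granularity=package, the two finer levels give gran_levels == 2.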
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  unsigned int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
                                                   const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->osId < bb->osId)
    return -1;
  if (aa->osId > bb->osId)
    return 1;
  return 0;
}

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  int rc;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will
    // be 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this only as an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained
    // locally.
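    // Illustrative example (hypothetical values): with maxThreadsPerPkg = 4
    // and maxCoresPerPkg = 2, widthCT = 2 and widthC = 1, so an apicId of
    // 0b1011 splits into pkgId = 0b10, coreId = 0b1, threadId = 0b1.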
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consistency checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
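  // For each sub-leaf of cpuid leaf 11 queried below: eax[4:0] gives the shift
  // width into the x2APIC Id for the next level, ebx[15:0] the number of
  // logical processors at this level, ecx[15:8] the level type (1 = SMT,
  // 2 = core), and edx the current x2APIC Id; ebx == 0 marks an invalid
  // sub-leaf.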
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try
  // to get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but
      // so far, the only machine we have seen which does not exit this loop
      // before iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  unsigned int proc;
  int nApics = 0;
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(proc);

    // Extract labels for each level in the machine topology map from Apic ID.
    Address addr(depth);
    int prev_shift = 0;

    for (level = 0; level < depth; level++) {
      __kmp_x86_cpuid(11, level, &buf);
      unsigned apicId = buf.edx;
      if (buf.ebx == 0) {
        if (level != depth - 1) {
          KMP_CPU_FREE(oldMask);
          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
          return -1;
        }
        addr.labels[depth - level - 1] = apicId >> prev_shift;
        level++;
        break;
      }
      int shift = buf.eax & 0x1f;
      int mask = (1 << shift) - 1;
      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
      prev_shift = shift;
    }
    if (level != depth) {
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }

    retval[nApics] = AddrUnsPair(addr, proc);
    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, return now.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

  // Find the radix at each of the levels.
  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  for (level = 0; level < depth; level++) {
    totals[level] = 1;
    maxCt[level] = 1;
    counts[level] = 1;
    last[level] = retval[0].first.labels[level];
  }

  // From here on, the iteration variable "level" runs from the finest level to
  // the coarsest, i.e. we iterate forward through
  // (*address2os)[].first.labels[] - in the previous loops, we iterated
  // backwards.
  for (proc = 1; (int)proc < nApics; proc++) {
    int level;
    for (level = 0; level < depth; level++) {
      if (retval[proc].first.labels[level] != last[level]) {
        int j;
        for (j = level + 1; j < depth; j++) {
          totals[j]++;
          counts[j] = 1;
          // The line below causes printing incorrect topology information in
          // case the max value for some level (maxCt[level]) is encountered
          // earlier than some smaller value while going through the array.
          // For example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then
          // maxCt[1] == 2 whereas it must be 4.
          // TODO!!! Check if it can be commented safely
          // maxCt[j] = 1;
          last[j] = retval[proc].first.labels[j];
        }
        totals[level]++;
        counts[level]++;
        if (counts[level] > maxCt[level]) {
          maxCt[level] = counts[level];
        }
        last[level] = retval[proc].first.labels[level];
        break;
      } else if (level == depth - 1) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
        return -1;
      }
    }
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return if affinity is not
  // enabled.
  if (threadLevel >= 0) {
    __kmp_nThreadsPerCore = maxCt[threadLevel];
  } else {
    __kmp_nThreadsPerCore = 1;
  }
  nPackages = totals[pkgLevel];

  if (coreLevel >= 0) {
    __kmp_ncores = totals[coreLevel];
    nCoresPerPkg = maxCt[coreLevel];
  } else {
    __kmp_ncores = nPackages;
    nCoresPerPkg = 1;
  }

  // Check to see if the machine topology is uniform
  unsigned prod = maxCt[0];
  for (level = 1; level < depth; level++) {
    prod *= maxCt[level];
  }
  bool uniform = (prod == totals[level - 1]);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", totals[0]);
    for (level = 1; level <= pkgLevel; level++) {
      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (proc = 0; (int)proc < nApics; ++proc) {
    __kmp_pu_os_idx[proc] = retval[proc].second;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  int new_depth = 0;
  for (level = 0; level < depth; level++) {
    if ((maxCt[level] == 1) && (level != pkgLevel)) {
      continue;
    }
    new_depth++;
  }

  // If we are removing any levels, allocate a new vector to return,
  // and copy the relevant information to it.
1597 if (new_depth != depth) { 1598 AddrUnsPair *new_retval = 1599 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1600 for (proc = 0; (int)proc < nApics; proc++) { 1601 Address addr(new_depth); 1602 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1603 } 1604 int new_level = 0; 1605 int newPkgLevel = -1; 1606 int newCoreLevel = -1; 1607 int newThreadLevel = -1; 1608 int i; 1609 for (level = 0; level < depth; level++) { 1610 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1611 // Remove this level. Never remove the package level 1612 continue; 1613 } 1614 if (level == pkgLevel) { 1615 newPkgLevel = level; 1616 } 1617 if (level == coreLevel) { 1618 newCoreLevel = level; 1619 } 1620 if (level == threadLevel) { 1621 newThreadLevel = level; 1622 } 1623 for (proc = 0; (int)proc < nApics; proc++) { 1624 new_retval[proc].first.labels[new_level] = 1625 retval[proc].first.labels[level]; 1626 } 1627 new_level++; 1628 } 1629 1630 __kmp_free(retval); 1631 retval = new_retval; 1632 depth = new_depth; 1633 pkgLevel = newPkgLevel; 1634 coreLevel = newCoreLevel; 1635 threadLevel = newThreadLevel; 1636 } 1637 1638 if (__kmp_affinity_gran_levels < 0) { 1639 // Set the granularity level based on what levels are modeled 1640 // in the machine topology map. 1641 __kmp_affinity_gran_levels = 0; 1642 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1643 __kmp_affinity_gran_levels++; 1644 } 1645 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1646 __kmp_affinity_gran_levels++; 1647 } 1648 if (__kmp_affinity_gran > affinity_gran_package) { 1649 __kmp_affinity_gran_levels++; 1650 } 1651 } 1652 1653 if (__kmp_affinity_verbose) { 1654 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, 1655 threadLevel); 1656 } 1657 1658 __kmp_free(last); 1659 __kmp_free(maxCt); 1660 __kmp_free(counts); 1661 __kmp_free(totals); 1662 KMP_CPU_FREE(oldMask); 1663 *address2os = retval; 1664 return depth; 1665 } 1666 1667 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1668 1669 #define osIdIndex 0 1670 #define threadIdIndex 1 1671 #define coreIdIndex 2 1672 #define pkgIdIndex 3 1673 #define nodeIdIndex 4 1674 1675 typedef unsigned *ProcCpuInfo; 1676 static unsigned maxIndex = pkgIdIndex; 1677 1678 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) { 1679 const unsigned *aa = (const unsigned *)a; 1680 const unsigned *bb = (const unsigned *)b; 1681 if (aa[osIdIndex] < bb[osIdIndex]) 1682 return -1; 1683 if (aa[osIdIndex] > bb[osIdIndex]) 1684 return 1; 1685 return 0; 1686 } 1687 1688 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 1689 const void *b) { 1690 unsigned i; 1691 const unsigned *aa = *(unsigned *const *)a; 1692 const unsigned *bb = *(unsigned *const *)b; 1693 for (i = maxIndex;; i--) { 1694 if (aa[i] < bb[i]) 1695 return -1; 1696 if (aa[i] > bb[i]) 1697 return 1; 1698 if (i == osIdIndex) 1699 break; 1700 } 1701 return 0; 1702 } 1703 1704 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1705 // affinity map. 1706 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 1707 int *line, 1708 kmp_i18n_id_t *const msg_id, 1709 FILE *f) { 1710 *address2os = NULL; 1711 *msg_id = kmp_i18n_null; 1712 1713 // Scan of the file, and count the number of "processor" (osId) fields, 1714 // and find the highest value of <n> for a node_<n> field. 
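  // For reference, a minimal /proc/cpuinfo fragment of the kind this first
  // pass counts (the field values are made up for illustration):
  //
  //   processor       : 0
  //   physical id     : 0
  //   core id         : 0
  //
  //   processor       : 1
  //   physical id     : 0
  //   core id         : 1
  //
  // Only the "processor" lines are counted here; "node_<n> id" lines, if
  // present in an alternate-format file, just raise maxIndex.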
1715   char buf[256];
1716   unsigned num_records = 0;
1717   while (!feof(f)) {
1718     buf[sizeof(buf) - 1] = 1;
1719     if (!fgets(buf, sizeof(buf), f)) {
1720       // Read errors presumably because of EOF
1721       break;
1722     }
1723
1724     char s1[] = "processor";
1725     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1726       num_records++;
1727       continue;
1728     }
1729
1730     // FIXME - this will match "node_<n> <garbage>"
1731     unsigned level;
1732     if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
1733       if (nodeIdIndex + level >= maxIndex) {
1734         maxIndex = nodeIdIndex + level;
1735       }
1736       continue;
1737     }
1738   }
1739
1740   // Check for empty file / no valid processor records, or too many. The number
1741   // of records can't exceed the number of valid bits in the affinity mask.
1742   if (num_records == 0) {
1743     *line = 0;
1744     *msg_id = kmp_i18n_str_NoProcRecords;
1745     return -1;
1746   }
1747   if (num_records > (unsigned)__kmp_xproc) {
1748     *line = 0;
1749     *msg_id = kmp_i18n_str_TooManyProcRecords;
1750     return -1;
1751   }
1752
1753   // Set the file pointer back to the beginning, so that we can scan the file
1754   // again, this time performing a full parse of the data. Allocate a vector of
1755   // ProcCpuInfo objects, where we will place the data. Adding an extra element
1756   // at the end allows us to remove a lot of extra checks for termination
1757   // conditions.
1758   if (fseek(f, 0, SEEK_SET) != 0) {
1759     *line = 0;
1760     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1761     return -1;
1762   }
1763
1764   // Allocate the array of records to store the proc info in. The dummy
1765   // element at the end makes the logic in filling them out easier to code.
1766   unsigned **threadInfo =
1767       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
1768   unsigned i;
1769   for (i = 0; i <= num_records; i++) {
1770     threadInfo[i] =
1771         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
1772   }
1773
1774 #define CLEANUP_THREAD_INFO \
1775   for (i = 0; i <= num_records; i++) { \
1776     __kmp_free(threadInfo[i]); \
1777   } \
1778   __kmp_free(threadInfo);
1779
1780   // A value of UINT_MAX means that we didn't find the field
1781   unsigned __index;
1782
1783 #define INIT_PROC_INFO(p) \
1784   for (__index = 0; __index <= maxIndex; __index++) { \
1785     (p)[__index] = UINT_MAX; \
1786   }
1787
1788   for (i = 0; i <= num_records; i++) {
1789     INIT_PROC_INFO(threadInfo[i]);
1790   }
1791
1792   unsigned num_avail = 0;
1793   *line = 0;
1794   while (!feof(f)) {
1795     // Create an inner scoping level, so that all the goto targets at the end of
1796     // the loop appear in an outer scoping level. This avoids warnings about
1797     // jumping past an initialization to a target in the same block.
1798     {
1799       buf[sizeof(buf) - 1] = 1;
1800       bool long_line = false;
1801       if (!fgets(buf, sizeof(buf), f)) {
1802         // Read errors presumably because of EOF
1803         // If there is valid data in threadInfo[num_avail], then fake
1804         // a blank line to ensure that the last address gets parsed.
1805         bool valid = false;
1806         for (i = 0; i <= maxIndex; i++) {
1807           if (threadInfo[num_avail][i] != UINT_MAX) {
1808             valid = true;
1809           }
1810         }
1811         if (!valid) {
1812           break;
1813         }
1814         buf[0] = 0;
1815       } else if (!buf[sizeof(buf) - 1]) {
1816         // The line is longer than the buffer. Set a flag and don't
1817         // emit an error if we were going to ignore the line, anyway.
1818 long_line = true; 1819 1820 #define CHECK_LINE \ 1821 if (long_line) { \ 1822 CLEANUP_THREAD_INFO; \ 1823 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1824 return -1; \ 1825 } 1826 } 1827 (*line)++; 1828 1829 char s1[] = "processor"; 1830 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1831 CHECK_LINE; 1832 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1833 unsigned val; 1834 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1835 goto no_val; 1836 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 1837 goto dup_field; 1838 threadInfo[num_avail][osIdIndex] = val; 1839 #if KMP_OS_LINUX && USE_SYSFS_INFO 1840 char path[256]; 1841 KMP_SNPRINTF( 1842 path, sizeof(path), 1843 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1844 threadInfo[num_avail][osIdIndex]); 1845 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1846 1847 KMP_SNPRINTF(path, sizeof(path), 1848 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1849 threadInfo[num_avail][osIdIndex]); 1850 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1851 continue; 1852 #else 1853 } 1854 char s2[] = "physical id"; 1855 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1856 CHECK_LINE; 1857 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1858 unsigned val; 1859 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1860 goto no_val; 1861 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 1862 goto dup_field; 1863 threadInfo[num_avail][pkgIdIndex] = val; 1864 continue; 1865 } 1866 char s3[] = "core id"; 1867 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1868 CHECK_LINE; 1869 char *p = strchr(buf + sizeof(s3) - 1, ':'); 1870 unsigned val; 1871 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1872 goto no_val; 1873 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 1874 goto dup_field; 1875 threadInfo[num_avail][coreIdIndex] = val; 1876 continue; 1877 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 1878 } 1879 char s4[] = "thread id"; 1880 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 1881 CHECK_LINE; 1882 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1883 unsigned val; 1884 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1885 goto no_val; 1886 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 1887 goto dup_field; 1888 threadInfo[num_avail][threadIdIndex] = val; 1889 continue; 1890 } 1891 unsigned level; 1892 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 1893 CHECK_LINE; 1894 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1895 unsigned val; 1896 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1897 goto no_val; 1898 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 1899 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 1900 goto dup_field; 1901 threadInfo[num_avail][nodeIdIndex + level] = val; 1902 continue; 1903 } 1904 1905 // We didn't recognize the leading token on the line. There are lots of 1906 // leading tokens that we don't recognize - if the line isn't empty, go on 1907 // to the next line. 1908 if ((*buf != 0) && (*buf != '\n')) { 1909 // If the line is longer than the buffer, read characters 1910 // until we find a newline. 1911 if (long_line) { 1912 int ch; 1913 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 1914 ; 1915 } 1916 continue; 1917 } 1918 1919 // A newline has signalled the end of the processor record. 1920 // Check that there aren't too many procs specified. 1921 if ((int)num_avail == __kmp_xproc) { 1922 CLEANUP_THREAD_INFO; 1923 *msg_id = kmp_i18n_str_TooManyEntries; 1924 return -1; 1925 } 1926 1927 // Check for missing fields. 
The osId field must be there, and we 1928 // currently require that the physical id field is specified, also. 1929 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 1930 CLEANUP_THREAD_INFO; 1931 *msg_id = kmp_i18n_str_MissingProcField; 1932 return -1; 1933 } 1934 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 1935 CLEANUP_THREAD_INFO; 1936 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 1937 return -1; 1938 } 1939 1940 // Skip this proc if it is not included in the machine model. 1941 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 1942 __kmp_affin_fullMask)) { 1943 INIT_PROC_INFO(threadInfo[num_avail]); 1944 continue; 1945 } 1946 1947 // We have a successful parse of this proc's info. 1948 // Increment the counter, and prepare for the next proc. 1949 num_avail++; 1950 KMP_ASSERT(num_avail <= num_records); 1951 INIT_PROC_INFO(threadInfo[num_avail]); 1952 } 1953 continue; 1954 1955 no_val: 1956 CLEANUP_THREAD_INFO; 1957 *msg_id = kmp_i18n_str_MissingValCpuinfo; 1958 return -1; 1959 1960 dup_field: 1961 CLEANUP_THREAD_INFO; 1962 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 1963 return -1; 1964 } 1965 *line = 0; 1966 1967 #if KMP_MIC && REDUCE_TEAM_SIZE 1968 unsigned teamSize = 0; 1969 #endif // KMP_MIC && REDUCE_TEAM_SIZE 1970 1971 // check for num_records == __kmp_xproc ??? 1972 1973 // If there's only one thread context to bind to, form an Address object with 1974 // depth 1 and return immediately (or, if affinity is off, set address2os to 1975 // NULL and return). 1976 // 1977 // If it is configured to omit the package level when there is only a single 1978 // package, the logic at the end of this routine won't work if there is only a 1979 // single thread - it would try to form an Address object with depth 0. 1980 KMP_ASSERT(num_avail > 0); 1981 KMP_ASSERT(num_avail <= num_records); 1982 if (num_avail == 1) { 1983 __kmp_ncores = 1; 1984 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1985 if (__kmp_affinity_verbose) { 1986 if (!KMP_AFFINITY_CAPABLE()) { 1987 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 1988 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1989 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1990 } else { 1991 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1992 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 1993 __kmp_affin_fullMask); 1994 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 1995 if (__kmp_affinity_respect_mask) { 1996 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1997 } else { 1998 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1999 } 2000 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2001 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2002 } 2003 int index; 2004 kmp_str_buf_t buf; 2005 __kmp_str_buf_init(&buf); 2006 __kmp_str_buf_print(&buf, "1"); 2007 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2008 __kmp_str_buf_print(&buf, " x 1"); 2009 } 2010 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2011 __kmp_str_buf_free(&buf); 2012 } 2013 2014 if (__kmp_affinity_type == affinity_none) { 2015 CLEANUP_THREAD_INFO; 2016 return 0; 2017 } 2018 2019 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2020 Address addr(1); 2021 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2022 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2023 2024 if (__kmp_affinity_gran_levels < 0) { 2025 __kmp_affinity_gran_levels = 0; 2026 } 2027 2028 if (__kmp_affinity_verbose) { 2029 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2030 } 2031 2032 CLEANUP_THREAD_INFO; 2033 
return 1; 2034 } 2035 2036 // Sort the threadInfo table by physical Id. 2037 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2038 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2039 2040 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2041 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2042 // the chips on a system. Although coreId's are usually assigned 2043 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2044 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2045 // 2046 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2047 // total # packages) are at this point - we want to determine that now. We 2048 // only have an upper bound on the first two figures. 2049 unsigned *counts = 2050 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2051 unsigned *maxCt = 2052 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2053 unsigned *totals = 2054 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2055 unsigned *lastId = 2056 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2057 2058 bool assign_thread_ids = false; 2059 unsigned threadIdCt; 2060 unsigned index; 2061 2062 restart_radix_check: 2063 threadIdCt = 0; 2064 2065 // Initialize the counter arrays with data from threadInfo[0]. 2066 if (assign_thread_ids) { 2067 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2068 threadInfo[0][threadIdIndex] = threadIdCt++; 2069 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2070 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2071 } 2072 } 2073 for (index = 0; index <= maxIndex; index++) { 2074 counts[index] = 1; 2075 maxCt[index] = 1; 2076 totals[index] = 1; 2077 lastId[index] = threadInfo[0][index]; 2078 ; 2079 } 2080 2081 // Run through the rest of the OS procs. 2082 for (i = 1; i < num_avail; i++) { 2083 // Find the most significant index whose id differs from the id for the 2084 // previous OS proc. 2085 for (index = maxIndex; index >= threadIdIndex; index--) { 2086 if (assign_thread_ids && (index == threadIdIndex)) { 2087 // Auto-assign the thread id field if it wasn't specified. 2088 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2089 threadInfo[i][threadIdIndex] = threadIdCt++; 2090 } 2091 // Apparently the thread id field was specified for some entries and not 2092 // others. Start the thread id counter off at the next higher thread id. 2093 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2094 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2095 } 2096 } 2097 if (threadInfo[i][index] != lastId[index]) { 2098 // Run through all indices which are less significant, and reset the 2099 // counts to 1. At all levels up to and including index, we need to 2100 // increment the totals and record the last id. 2101 unsigned index2; 2102 for (index2 = threadIdIndex; index2 < index; index2++) { 2103 totals[index2]++; 2104 if (counts[index2] > maxCt[index2]) { 2105 maxCt[index2] = counts[index2]; 2106 } 2107 counts[index2] = 1; 2108 lastId[index2] = threadInfo[i][index2]; 2109 } 2110 counts[index]++; 2111 totals[index]++; 2112 lastId[index] = threadInfo[i][index]; 2113 2114 if (assign_thread_ids && (index > threadIdIndex)) { 2115 2116 #if KMP_MIC && REDUCE_TEAM_SIZE 2117 // The default team size is the total #threads in the machine 2118 // minus 1 thread for every core that has 3 or more threads. 2119 teamSize += (threadIdCt <= 2) ? 
(threadIdCt) : (threadIdCt - 1);
2120 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2121
2122           // Restart the thread counter, as we are on a new core.
2123           threadIdCt = 0;
2124
2125           // Auto-assign the thread id field if it wasn't specified.
2126           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2127             threadInfo[i][threadIdIndex] = threadIdCt++;
2128           }
2129
2130           // Apparently the thread id field was specified for some entries and
2131           // not others. Start the thread id counter off at the next higher
2132           // thread id.
2133           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2134             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2135           }
2136         }
2137         break;
2138       }
2139     }
2140     if (index < threadIdIndex) {
2141       // If thread ids were specified, it is an error if they are not unique.
2142       // Also, check that we haven't already restarted the loop (to be safe -
2143       // shouldn't need to).
2144       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2145         __kmp_free(lastId);
2146         __kmp_free(totals);
2147         __kmp_free(maxCt);
2148         __kmp_free(counts);
2149         CLEANUP_THREAD_INFO;
2150         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2151         return -1;
2152       }
2153
2154       // If the thread ids were not specified and we see entries that
2155       // are duplicates, start the loop over and assign the thread ids manually.
2156       assign_thread_ids = true;
2157       goto restart_radix_check;
2158     }
2159   }
2160
2161 #if KMP_MIC && REDUCE_TEAM_SIZE
2162   // The default team size is the total #threads in the machine
2163   // minus 1 thread for every core that has 3 or more threads.
2164   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2165 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2166
2167   for (index = threadIdIndex; index <= maxIndex; index++) {
2168     if (counts[index] > maxCt[index]) {
2169       maxCt[index] = counts[index];
2170     }
2171   }
2172
2173   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2174   nCoresPerPkg = maxCt[coreIdIndex];
2175   nPackages = totals[pkgIdIndex];
2176
2177   // Check to see if the machine topology is uniform
2178   unsigned prod = totals[maxIndex];
2179   for (index = threadIdIndex; index < maxIndex; index++) {
2180     prod *= maxCt[index];
2181   }
2182   bool uniform = (prod == totals[threadIdIndex]);
2183
2184   // When affinity is off, this routine will still be called to set
2185   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2186   // Make sure all these vars are set correctly, and return now if affinity is
2187   // not enabled.
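  // Worked example with hypothetical numbers: on a 2-package machine with 4
  // cores per package and 2 threads per core, totals[pkgIdIndex] == 2,
  // maxCt[coreIdIndex] == 4 and maxCt[threadIdIndex] == 2, giving
  // nPackages = 2, nCoresPerPkg = 4, __kmp_nThreadsPerCore = 2 and
  // __kmp_ncores = totals[coreIdIndex] == 8; the product check above yields
  // prod == totals[threadIdIndex] == 16, so the topology is reported as
  // uniform.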
2188 __kmp_ncores = totals[coreIdIndex]; 2189 2190 if (__kmp_affinity_verbose) { 2191 if (!KMP_AFFINITY_CAPABLE()) { 2192 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2193 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2194 if (uniform) { 2195 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2196 } else { 2197 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2198 } 2199 } else { 2200 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2201 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2202 __kmp_affin_fullMask); 2203 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2204 if (__kmp_affinity_respect_mask) { 2205 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2206 } else { 2207 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2208 } 2209 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2210 if (uniform) { 2211 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2212 } else { 2213 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2214 } 2215 } 2216 kmp_str_buf_t buf; 2217 __kmp_str_buf_init(&buf); 2218 2219 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2220 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2221 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2222 } 2223 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2224 maxCt[threadIdIndex], __kmp_ncores); 2225 2226 __kmp_str_buf_free(&buf); 2227 } 2228 2229 #if KMP_MIC && REDUCE_TEAM_SIZE 2230 // Set the default team size. 2231 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2232 __kmp_dflt_team_nth = teamSize; 2233 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2234 "__kmp_dflt_team_nth = %d\n", 2235 __kmp_dflt_team_nth)); 2236 } 2237 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2238 2239 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2240 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2241 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2242 for (i = 0; i < num_avail; ++i) { // fill the os indices 2243 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2244 } 2245 2246 if (__kmp_affinity_type == affinity_none) { 2247 __kmp_free(lastId); 2248 __kmp_free(totals); 2249 __kmp_free(maxCt); 2250 __kmp_free(counts); 2251 CLEANUP_THREAD_INFO; 2252 return 0; 2253 } 2254 2255 // Count the number of levels which have more nodes at that level than at the 2256 // parent's level (with there being an implicit root node of the top level). 2257 // This is equivalent to saying that there is at least one node at this level 2258 // which has a sibling. These levels are in the map, and the package level is 2259 // always in the map. 2260 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2261 int level = 0; 2262 for (index = threadIdIndex; index < maxIndex; index++) { 2263 KMP_ASSERT(totals[index] >= totals[index + 1]); 2264 inMap[index] = (totals[index] > totals[index + 1]); 2265 } 2266 inMap[maxIndex] = (totals[maxIndex] > 1); 2267 inMap[pkgIdIndex] = true; 2268 2269 int depth = 0; 2270 for (index = threadIdIndex; index <= maxIndex; index++) { 2271 if (inMap[index]) { 2272 depth++; 2273 } 2274 } 2275 KMP_ASSERT(depth > 0); 2276 2277 // Construct the data structure that is to be returned. 
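  // Sketch of the layout being built (hypothetical proc, for illustration
  // only): for an OS proc with osId 5 on package 1, core 2, thread 0, and with
  // all three of those levels present in inMap, the resulting entry is an
  // AddrUnsPair whose Address has labels[] = { 1, 2, 0 } (package, core,
  // thread, most significant first) and whose second member is the osId 5.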
2278 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2279 int pkgLevel = -1; 2280 int coreLevel = -1; 2281 int threadLevel = -1; 2282 2283 for (i = 0; i < num_avail; ++i) { 2284 Address addr(depth); 2285 unsigned os = threadInfo[i][osIdIndex]; 2286 int src_index; 2287 int dst_index = 0; 2288 2289 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2290 if (!inMap[src_index]) { 2291 continue; 2292 } 2293 addr.labels[dst_index] = threadInfo[i][src_index]; 2294 if (src_index == pkgIdIndex) { 2295 pkgLevel = dst_index; 2296 } else if (src_index == coreIdIndex) { 2297 coreLevel = dst_index; 2298 } else if (src_index == threadIdIndex) { 2299 threadLevel = dst_index; 2300 } 2301 dst_index++; 2302 } 2303 (*address2os)[i] = AddrUnsPair(addr, os); 2304 } 2305 2306 if (__kmp_affinity_gran_levels < 0) { 2307 // Set the granularity level based on what levels are modeled 2308 // in the machine topology map. 2309 unsigned src_index; 2310 __kmp_affinity_gran_levels = 0; 2311 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2312 if (!inMap[src_index]) { 2313 continue; 2314 } 2315 switch (src_index) { 2316 case threadIdIndex: 2317 if (__kmp_affinity_gran > affinity_gran_thread) { 2318 __kmp_affinity_gran_levels++; 2319 } 2320 2321 break; 2322 case coreIdIndex: 2323 if (__kmp_affinity_gran > affinity_gran_core) { 2324 __kmp_affinity_gran_levels++; 2325 } 2326 break; 2327 2328 case pkgIdIndex: 2329 if (__kmp_affinity_gran > affinity_gran_package) { 2330 __kmp_affinity_gran_levels++; 2331 } 2332 break; 2333 } 2334 } 2335 } 2336 2337 if (__kmp_affinity_verbose) { 2338 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2339 coreLevel, threadLevel); 2340 } 2341 2342 __kmp_free(inMap); 2343 __kmp_free(lastId); 2344 __kmp_free(totals); 2345 __kmp_free(maxCt); 2346 __kmp_free(counts); 2347 CLEANUP_THREAD_INFO; 2348 return depth; 2349 } 2350 2351 // Create and return a table of affinity masks, indexed by OS thread ID. 2352 // This routine handles OR'ing together all the affinity masks of threads 2353 // that are sufficiently close, if granularity > fine. 2354 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2355 unsigned *numUnique, 2356 AddrUnsPair *address2os, 2357 unsigned numAddrs) { 2358 // First form a table of affinity masks in order of OS thread id. 2359 unsigned depth; 2360 unsigned maxOsId; 2361 unsigned i; 2362 2363 KMP_ASSERT(numAddrs > 0); 2364 depth = address2os[0].first.depth; 2365 2366 maxOsId = 0; 2367 for (i = 0; i < numAddrs; i++) { 2368 unsigned osId = address2os[i].second; 2369 if (osId > maxOsId) { 2370 maxOsId = osId; 2371 } 2372 } 2373 kmp_affin_mask_t *osId2Mask; 2374 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2375 2376 // Sort the address2os table according to physical order. Doing so will put 2377 // all threads on the same core/package/node in consecutive locations. 2378 qsort(address2os, numAddrs, sizeof(*address2os), 2379 __kmp_affinity_cmp_Address_labels); 2380 2381 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2382 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2383 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2384 } 2385 if (__kmp_affinity_gran_levels >= (int)depth) { 2386 if (__kmp_affinity_verbose || 2387 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2388 KMP_WARNING(AffThreadsMayMigrate); 2389 } 2390 } 2391 2392 // Run through the table, forming the masks for all threads on each core. 
2393 // Threads on the same core will have identical "Address" objects, not 2394 // considering the last level, which must be the thread id. All threads on a 2395 // core will appear consecutively. 2396 unsigned unique = 0; 2397 unsigned j = 0; // index of 1st thread on core 2398 unsigned leader = 0; 2399 Address *leaderAddr = &(address2os[0].first); 2400 kmp_affin_mask_t *sum; 2401 KMP_CPU_ALLOC_ON_STACK(sum); 2402 KMP_CPU_ZERO(sum); 2403 KMP_CPU_SET(address2os[0].second, sum); 2404 for (i = 1; i < numAddrs; i++) { 2405 // If this thread is sufficiently close to the leader (within the 2406 // granularity setting), then set the bit for this os thread in the 2407 // affinity mask for this group, and go on to the next thread. 2408 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2409 KMP_CPU_SET(address2os[i].second, sum); 2410 continue; 2411 } 2412 2413 // For every thread in this group, copy the mask to the thread's entry in 2414 // the osId2Mask table. Mark the first address as a leader. 2415 for (; j < i; j++) { 2416 unsigned osId = address2os[j].second; 2417 KMP_DEBUG_ASSERT(osId <= maxOsId); 2418 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2419 KMP_CPU_COPY(mask, sum); 2420 address2os[j].first.leader = (j == leader); 2421 } 2422 unique++; 2423 2424 // Start a new mask. 2425 leader = i; 2426 leaderAddr = &(address2os[i].first); 2427 KMP_CPU_ZERO(sum); 2428 KMP_CPU_SET(address2os[i].second, sum); 2429 } 2430 2431 // For every thread in last group, copy the mask to the thread's 2432 // entry in the osId2Mask table. 2433 for (; j < i; j++) { 2434 unsigned osId = address2os[j].second; 2435 KMP_DEBUG_ASSERT(osId <= maxOsId); 2436 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2437 KMP_CPU_COPY(mask, sum); 2438 address2os[j].first.leader = (j == leader); 2439 } 2440 unique++; 2441 KMP_CPU_FREE_FROM_STACK(sum); 2442 2443 *maxIndex = maxOsId; 2444 *numUnique = unique; 2445 return osId2Mask; 2446 } 2447 2448 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2449 // as file-static than to try and pass them through the calling sequence of 2450 // the recursive-descent OMP_PLACES parser. 2451 static kmp_affin_mask_t *newMasks; 2452 static int numNewMasks; 2453 static int nextNewMask; 2454 2455 #define ADD_MASK(_mask) \ 2456 { \ 2457 if (nextNewMask >= numNewMasks) { \ 2458 int i; \ 2459 numNewMasks *= 2; \ 2460 kmp_affin_mask_t *temp; \ 2461 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2462 for (i = 0; i < numNewMasks / 2; i++) { \ 2463 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2464 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2465 KMP_CPU_COPY(dest, src); \ 2466 } \ 2467 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2468 newMasks = temp; \ 2469 } \ 2470 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2471 nextNewMask++; \ 2472 } 2473 2474 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2475 { \ 2476 if (((_osId) > _maxOsId) || \ 2477 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2478 if (__kmp_affinity_verbose || \ 2479 (__kmp_affinity_warnings && \ 2480 (__kmp_affinity_type != affinity_none))) { \ 2481 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2482 } \ 2483 } else { \ 2484 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2485 } \ 2486 } 2487 2488 // Re-parse the proclist (for the explicit affinity type), and form the list 2489 // of affinity newMasks indexed by gtid. 
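// As an illustration (a hypothetical setting, not taken from the sources),
// a proclist such as
//
//   KMP_AFFINITY="explicit,proclist=[0,3-5,{8,9},12-18:3]"
//
// would be parsed below into the mask list {0}, {3}, {4}, {5}, {8,9}, {12},
// {15}, {18}: single ids and range members each get their own mask, while a
// braced set is OR'ed into a single mask.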
2490 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2491 unsigned int *out_numMasks, 2492 const char *proclist, 2493 kmp_affin_mask_t *osId2Mask, 2494 int maxOsId) { 2495 int i; 2496 const char *scan = proclist; 2497 const char *next = proclist; 2498 2499 // We use malloc() for the temporary mask vector, so that we can use 2500 // realloc() to extend it. 2501 numNewMasks = 2; 2502 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2503 nextNewMask = 0; 2504 kmp_affin_mask_t *sumMask; 2505 KMP_CPU_ALLOC(sumMask); 2506 int setSize = 0; 2507 2508 for (;;) { 2509 int start, end, stride; 2510 2511 SKIP_WS(scan); 2512 next = scan; 2513 if (*next == '\0') { 2514 break; 2515 } 2516 2517 if (*next == '{') { 2518 int num; 2519 setSize = 0; 2520 next++; // skip '{' 2521 SKIP_WS(next); 2522 scan = next; 2523 2524 // Read the first integer in the set. 2525 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2526 SKIP_DIGITS(next); 2527 num = __kmp_str_to_int(scan, *next); 2528 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2529 2530 // Copy the mask for that osId to the sum (union) mask. 2531 if ((num > maxOsId) || 2532 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2533 if (__kmp_affinity_verbose || 2534 (__kmp_affinity_warnings && 2535 (__kmp_affinity_type != affinity_none))) { 2536 KMP_WARNING(AffIgnoreInvalidProcID, num); 2537 } 2538 KMP_CPU_ZERO(sumMask); 2539 } else { 2540 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2541 setSize = 1; 2542 } 2543 2544 for (;;) { 2545 // Check for end of set. 2546 SKIP_WS(next); 2547 if (*next == '}') { 2548 next++; // skip '}' 2549 break; 2550 } 2551 2552 // Skip optional comma. 2553 if (*next == ',') { 2554 next++; 2555 } 2556 SKIP_WS(next); 2557 2558 // Read the next integer in the set. 2559 scan = next; 2560 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2561 2562 SKIP_DIGITS(next); 2563 num = __kmp_str_to_int(scan, *next); 2564 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2565 2566 // Add the mask for that osId to the sum mask. 2567 if ((num > maxOsId) || 2568 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2569 if (__kmp_affinity_verbose || 2570 (__kmp_affinity_warnings && 2571 (__kmp_affinity_type != affinity_none))) { 2572 KMP_WARNING(AffIgnoreInvalidProcID, num); 2573 } 2574 } else { 2575 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2576 setSize++; 2577 } 2578 } 2579 if (setSize > 0) { 2580 ADD_MASK(sumMask); 2581 } 2582 2583 SKIP_WS(next); 2584 if (*next == ',') { 2585 next++; 2586 } 2587 scan = next; 2588 continue; 2589 } 2590 2591 // Read the first integer. 2592 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2593 SKIP_DIGITS(next); 2594 start = __kmp_str_to_int(scan, *next); 2595 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2596 SKIP_WS(next); 2597 2598 // If this isn't a range, then add a mask to the list and go on. 2599 if (*next != '-') { 2600 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2601 2602 // Skip optional comma. 2603 if (*next == ',') { 2604 next++; 2605 } 2606 scan = next; 2607 continue; 2608 } 2609 2610 // This is a range. Skip over the '-' and read in the 2nd int. 
2611 next++; // skip '-' 2612 SKIP_WS(next); 2613 scan = next; 2614 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2615 SKIP_DIGITS(next); 2616 end = __kmp_str_to_int(scan, *next); 2617 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2618 2619 // Check for a stride parameter 2620 stride = 1; 2621 SKIP_WS(next); 2622 if (*next == ':') { 2623 // A stride is specified. Skip over the ':" and read the 3rd int. 2624 int sign = +1; 2625 next++; // skip ':' 2626 SKIP_WS(next); 2627 scan = next; 2628 if (*next == '-') { 2629 sign = -1; 2630 next++; 2631 SKIP_WS(next); 2632 scan = next; 2633 } 2634 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2635 SKIP_DIGITS(next); 2636 stride = __kmp_str_to_int(scan, *next); 2637 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2638 stride *= sign; 2639 } 2640 2641 // Do some range checks. 2642 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2643 if (stride > 0) { 2644 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2645 } else { 2646 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2647 } 2648 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2649 2650 // Add the mask for each OS proc # to the list. 2651 if (stride > 0) { 2652 do { 2653 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2654 start += stride; 2655 } while (start <= end); 2656 } else { 2657 do { 2658 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2659 start += stride; 2660 } while (start >= end); 2661 } 2662 2663 // Skip optional comma. 2664 SKIP_WS(next); 2665 if (*next == ',') { 2666 next++; 2667 } 2668 scan = next; 2669 } 2670 2671 *out_numMasks = nextNewMask; 2672 if (nextNewMask == 0) { 2673 *out_masks = NULL; 2674 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2675 return; 2676 } 2677 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2678 for (i = 0; i < nextNewMask; i++) { 2679 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2680 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2681 KMP_CPU_COPY(dest, src); 2682 } 2683 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2684 KMP_CPU_FREE(sumMask); 2685 } 2686 2687 #if OMP_40_ENABLED 2688 2689 /*----------------------------------------------------------------------------- 2690 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2691 places. Again, Here is the grammar: 2692 2693 place_list := place 2694 place_list := place , place_list 2695 place := num 2696 place := place : num 2697 place := place : num : signed 2698 place := { subplacelist } 2699 place := ! 
place // (lowest priority) 2700 subplace_list := subplace 2701 subplace_list := subplace , subplace_list 2702 subplace := num 2703 subplace := num : num 2704 subplace := num : num : signed 2705 signed := num 2706 signed := + signed 2707 signed := - signed 2708 -----------------------------------------------------------------------------*/ 2709 2710 static void __kmp_process_subplace_list(const char **scan, 2711 kmp_affin_mask_t *osId2Mask, 2712 int maxOsId, kmp_affin_mask_t *tempMask, 2713 int *setSize) { 2714 const char *next; 2715 2716 for (;;) { 2717 int start, count, stride, i; 2718 2719 // Read in the starting proc id 2720 SKIP_WS(*scan); 2721 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2722 next = *scan; 2723 SKIP_DIGITS(next); 2724 start = __kmp_str_to_int(*scan, *next); 2725 KMP_ASSERT(start >= 0); 2726 *scan = next; 2727 2728 // valid follow sets are ',' ':' and '}' 2729 SKIP_WS(*scan); 2730 if (**scan == '}' || **scan == ',') { 2731 if ((start > maxOsId) || 2732 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2733 if (__kmp_affinity_verbose || 2734 (__kmp_affinity_warnings && 2735 (__kmp_affinity_type != affinity_none))) { 2736 KMP_WARNING(AffIgnoreInvalidProcID, start); 2737 } 2738 } else { 2739 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2740 (*setSize)++; 2741 } 2742 if (**scan == '}') { 2743 break; 2744 } 2745 (*scan)++; // skip ',' 2746 continue; 2747 } 2748 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2749 (*scan)++; // skip ':' 2750 2751 // Read count parameter 2752 SKIP_WS(*scan); 2753 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2754 next = *scan; 2755 SKIP_DIGITS(next); 2756 count = __kmp_str_to_int(*scan, *next); 2757 KMP_ASSERT(count >= 0); 2758 *scan = next; 2759 2760 // valid follow sets are ',' ':' and '}' 2761 SKIP_WS(*scan); 2762 if (**scan == '}' || **scan == ',') { 2763 for (i = 0; i < count; i++) { 2764 if ((start > maxOsId) || 2765 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2766 if (__kmp_affinity_verbose || 2767 (__kmp_affinity_warnings && 2768 (__kmp_affinity_type != affinity_none))) { 2769 KMP_WARNING(AffIgnoreInvalidProcID, start); 2770 } 2771 break; // don't proliferate warnings for large count 2772 } else { 2773 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2774 start++; 2775 (*setSize)++; 2776 } 2777 } 2778 if (**scan == '}') { 2779 break; 2780 } 2781 (*scan)++; // skip ',' 2782 continue; 2783 } 2784 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2785 (*scan)++; // skip ':' 2786 2787 // Read stride parameter 2788 int sign = +1; 2789 for (;;) { 2790 SKIP_WS(*scan); 2791 if (**scan == '+') { 2792 (*scan)++; // skip '+' 2793 continue; 2794 } 2795 if (**scan == '-') { 2796 sign *= -1; 2797 (*scan)++; // skip '-' 2798 continue; 2799 } 2800 break; 2801 } 2802 SKIP_WS(*scan); 2803 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2804 next = *scan; 2805 SKIP_DIGITS(next); 2806 stride = __kmp_str_to_int(*scan, *next); 2807 KMP_ASSERT(stride >= 0); 2808 *scan = next; 2809 stride *= sign; 2810 2811 // valid follow sets are ',' and '}' 2812 SKIP_WS(*scan); 2813 if (**scan == '}' || **scan == ',') { 2814 for (i = 0; i < count; i++) { 2815 if ((start > maxOsId) || 2816 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2817 if (__kmp_affinity_verbose || 2818 (__kmp_affinity_warnings && 2819 (__kmp_affinity_type != affinity_none))) { 2820 KMP_WARNING(AffIgnoreInvalidProcID, start); 2821 } 2822 
break; // don't proliferate warnings for large count 2823 } else { 2824 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2825 start += stride; 2826 (*setSize)++; 2827 } 2828 } 2829 if (**scan == '}') { 2830 break; 2831 } 2832 (*scan)++; // skip ',' 2833 continue; 2834 } 2835 2836 KMP_ASSERT2(0, "bad explicit places list"); 2837 } 2838 } 2839 2840 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 2841 int maxOsId, kmp_affin_mask_t *tempMask, 2842 int *setSize) { 2843 const char *next; 2844 2845 // valid follow sets are '{' '!' and num 2846 SKIP_WS(*scan); 2847 if (**scan == '{') { 2848 (*scan)++; // skip '{' 2849 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 2850 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 2851 (*scan)++; // skip '}' 2852 } else if (**scan == '!') { 2853 (*scan)++; // skip '!' 2854 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 2855 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 2856 } else if ((**scan >= '0') && (**scan <= '9')) { 2857 next = *scan; 2858 SKIP_DIGITS(next); 2859 int num = __kmp_str_to_int(*scan, *next); 2860 KMP_ASSERT(num >= 0); 2861 if ((num > maxOsId) || 2862 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2863 if (__kmp_affinity_verbose || 2864 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2865 KMP_WARNING(AffIgnoreInvalidProcID, num); 2866 } 2867 } else { 2868 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 2869 (*setSize)++; 2870 } 2871 *scan = next; // skip num 2872 } else { 2873 KMP_ASSERT2(0, "bad explicit places list"); 2874 } 2875 } 2876 2877 // static void 2878 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 2879 unsigned int *out_numMasks, 2880 const char *placelist, 2881 kmp_affin_mask_t *osId2Mask, 2882 int maxOsId) { 2883 int i, j, count, stride, sign; 2884 const char *scan = placelist; 2885 const char *next = placelist; 2886 2887 numNewMasks = 2; 2888 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2889 nextNewMask = 0; 2890 2891 // tempMask is modified based on the previous or initial 2892 // place to form the current place 2893 // previousMask contains the previous place 2894 kmp_affin_mask_t *tempMask; 2895 kmp_affin_mask_t *previousMask; 2896 KMP_CPU_ALLOC(tempMask); 2897 KMP_CPU_ZERO(tempMask); 2898 KMP_CPU_ALLOC(previousMask); 2899 KMP_CPU_ZERO(previousMask); 2900 int setSize = 0; 2901 2902 for (;;) { 2903 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 2904 2905 // valid follow sets are ',' ':' and EOL 2906 SKIP_WS(scan); 2907 if (*scan == '\0' || *scan == ',') { 2908 if (setSize > 0) { 2909 ADD_MASK(tempMask); 2910 } 2911 KMP_CPU_ZERO(tempMask); 2912 setSize = 0; 2913 if (*scan == '\0') { 2914 break; 2915 } 2916 scan++; // skip ',' 2917 continue; 2918 } 2919 2920 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2921 scan++; // skip ':' 2922 2923 // Read count parameter 2924 SKIP_WS(scan); 2925 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2926 next = scan; 2927 SKIP_DIGITS(next); 2928 count = __kmp_str_to_int(scan, *next); 2929 KMP_ASSERT(count >= 0); 2930 scan = next; 2931 2932 // valid follow sets are ',' ':' and EOL 2933 SKIP_WS(scan); 2934 if (*scan == '\0' || *scan == ',') { 2935 stride = +1; 2936 } else { 2937 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2938 scan++; // skip ':' 2939 2940 // Read stride parameter 2941 sign = +1; 2942 for (;;) { 2943 SKIP_WS(scan); 2944 if (*scan == '+') { 2945 scan++; // skip 
'+' 2946 continue; 2947 } 2948 if (*scan == '-') { 2949 sign *= -1; 2950 scan++; // skip '-' 2951 continue; 2952 } 2953 break; 2954 } 2955 SKIP_WS(scan); 2956 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2957 next = scan; 2958 SKIP_DIGITS(next); 2959 stride = __kmp_str_to_int(scan, *next); 2960 KMP_DEBUG_ASSERT(stride >= 0); 2961 scan = next; 2962 stride *= sign; 2963 } 2964 2965 // Add places determined by initial_place : count : stride 2966 for (i = 0; i < count; i++) { 2967 if (setSize == 0) { 2968 break; 2969 } 2970 // Add the current place, then build the next place (tempMask) from that 2971 KMP_CPU_COPY(previousMask, tempMask); 2972 ADD_MASK(previousMask); 2973 KMP_CPU_ZERO(tempMask); 2974 setSize = 0; 2975 KMP_CPU_SET_ITERATE(j, previousMask) { 2976 if (!KMP_CPU_ISSET(j, previousMask)) { 2977 continue; 2978 } 2979 if ((j + stride > maxOsId) || (j + stride < 0) || 2980 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 2981 (!KMP_CPU_ISSET(j + stride, 2982 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 2983 if ((__kmp_affinity_verbose || 2984 (__kmp_affinity_warnings && 2985 (__kmp_affinity_type != affinity_none))) && 2986 i < count - 1) { 2987 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 2988 } 2989 continue; 2990 } 2991 KMP_CPU_SET(j + stride, tempMask); 2992 setSize++; 2993 } 2994 } 2995 KMP_CPU_ZERO(tempMask); 2996 setSize = 0; 2997 2998 // valid follow sets are ',' and EOL 2999 SKIP_WS(scan); 3000 if (*scan == '\0') { 3001 break; 3002 } 3003 if (*scan == ',') { 3004 scan++; // skip ',' 3005 continue; 3006 } 3007 3008 KMP_ASSERT2(0, "bad explicit places list"); 3009 } 3010 3011 *out_numMasks = nextNewMask; 3012 if (nextNewMask == 0) { 3013 *out_masks = NULL; 3014 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3015 return; 3016 } 3017 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3018 KMP_CPU_FREE(tempMask); 3019 KMP_CPU_FREE(previousMask); 3020 for (i = 0; i < nextNewMask; i++) { 3021 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3022 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3023 KMP_CPU_COPY(dest, src); 3024 } 3025 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3026 } 3027 3028 #endif /* OMP_40_ENABLED */ 3029 3030 #undef ADD_MASK 3031 #undef ADD_MASK_OSID 3032 3033 #if KMP_USE_HWLOC 3034 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, 3035 hwloc_obj_type_t type, 3036 hwloc_obj_t *f) { 3037 if (!hwloc_compare_types(o->type, type)) { 3038 if (*f == NULL) 3039 *f = o; // output first descendant found 3040 return 1; 3041 } 3042 int sum = 0; 3043 for (unsigned i = 0; i < o->arity; i++) 3044 sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); 3045 return sum; // will be 0 if no one found (as PU arity is 0) 3046 } 3047 3048 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, 3049 hwloc_obj_t o, unsigned depth, 3050 hwloc_obj_t *f) { 3051 if (o->depth == depth) { 3052 if (*f == NULL) 3053 *f = o; // output first descendant found 3054 return 1; 3055 } 3056 int sum = 0; 3057 for (unsigned i = 0; i < o->arity; i++) 3058 sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); 3059 return sum; // will be 0 if no one found (as PU arity is 0) 3060 } 3061 3062 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3063 // skip PUs descendants of the object o 3064 int skipped = 0; 3065 hwloc_obj_t hT = NULL; 3066 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3067 for (int i = 0; i < N; ++i) { 3068 
    KMP_DEBUG_ASSERT(hT);
3069     unsigned idx = hT->os_index;
3070     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3071       KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3072       KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3073       ++skipped;
3074     }
3075     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3076   }
3077   return skipped; // count number of skipped units
3078 }
3079
3080 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
3081   // check if obj has PUs present in fullMask
3082   hwloc_obj_t hT = NULL;
3083   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3084   for (int i = 0; i < N; ++i) {
3085     KMP_DEBUG_ASSERT(hT);
3086     unsigned idx = hT->os_index;
3087     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
3088       return 1; // found PU
3089     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3090   }
3091   return 0; // no PUs found
3092 }
3093 #endif // KMP_USE_HWLOC
3094
3095 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
3096   AddrUnsPair *newAddr;
3097   if (__kmp_hws_requested == 0)
3098     goto _exit; // no topology limiting actions requested, exit
3099 #if KMP_USE_HWLOC
3100   if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3101     // The number of sub-objects is calculated dynamically; this works fine
3102     // for any non-uniform topology.
3103     // L2 cache objects are determined by depth, other objects by type.
3104     hwloc_topology_t tp = __kmp_hwloc_topology;
3105     int nS = 0, nN = 0, nL = 0, nC = 0,
3106         nT = 0; // logical index including skipped
3107     int nCr = 0, nTr = 0; // number of requested units
3108     int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3109     hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3110     int L2depth, idx;
3111
3112     // check support of extensions ----------------------------------
3113     int numa_support = 0, tile_support = 0;
3114     if (__kmp_pu_os_idx)
3115       hT = hwloc_get_pu_obj_by_os_index(tp,
3116                                         __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3117     else
3118       hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3119     if (hT == NULL) { // something's gone wrong
3120       KMP_WARNING(AffHWSubsetUnsupported);
3121       goto _exit;
3122     }
3123     // check NUMA node
3124     hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3125     hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3126     if (hN != NULL && hN->depth > hS->depth) {
3127       numa_support = 1; // 1 in case socket includes node(s)
3128     } else if (__kmp_hws_node.num > 0) {
3129       // don't support sockets inside NUMA node (no such HW found for testing)
3130       KMP_WARNING(AffHWSubsetUnsupported);
3131       goto _exit;
3132     }
3133     // check L2 cache, get object by depth because of multiple caches
3134     L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3135     hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3136     if (hL != NULL &&
3137         __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
3138       tile_support = 1; // no sense to count L2 if it includes a single core
3139     } else if (__kmp_hws_tile.num > 0) {
3140       if (__kmp_hws_core.num == 0) {
3141         __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3142         __kmp_hws_tile.num = 0;
3143       } else {
3144         // L2 and core are both requested, but represent the same object
3145         KMP_WARNING(AffHWSubsetInvalid);
3146         goto _exit;
3147       }
3148     }
3149     // end of check of extensions -----------------------------------
3150
3151     // fill in unset items, validate settings -----------------------
3152     if (__kmp_hws_socket.num == 0)
3153
__kmp_hws_socket.num = nPackages; // use all available sockets 3154 if (__kmp_hws_socket.offset >= nPackages) { 3155 KMP_WARNING(AffHWSubsetManySockets); 3156 goto _exit; 3157 } 3158 if (numa_support) { 3159 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3160 &hN); // num nodes in socket 3161 if (__kmp_hws_node.num == 0) 3162 __kmp_hws_node.num = NN; // use all available nodes 3163 if (__kmp_hws_node.offset >= NN) { 3164 KMP_WARNING(AffHWSubsetManyNodes); 3165 goto _exit; 3166 } 3167 if (tile_support) { 3168 // get num tiles in node 3169 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3170 if (__kmp_hws_tile.num == 0) { 3171 __kmp_hws_tile.num = NL + 1; 3172 } // use all available tiles, some node may have more tiles, thus +1 3173 if (__kmp_hws_tile.offset >= NL) { 3174 KMP_WARNING(AffHWSubsetManyTiles); 3175 goto _exit; 3176 } 3177 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3178 &hC); // num cores in tile 3179 if (__kmp_hws_core.num == 0) 3180 __kmp_hws_core.num = NC; // use all available cores 3181 if (__kmp_hws_core.offset >= NC) { 3182 KMP_WARNING(AffHWSubsetManyCores); 3183 goto _exit; 3184 } 3185 } else { // tile_support 3186 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3187 &hC); // num cores in node 3188 if (__kmp_hws_core.num == 0) 3189 __kmp_hws_core.num = NC; // use all available cores 3190 if (__kmp_hws_core.offset >= NC) { 3191 KMP_WARNING(AffHWSubsetManyCores); 3192 goto _exit; 3193 } 3194 } // tile_support 3195 } else { // numa_support 3196 if (tile_support) { 3197 // get num tiles in socket 3198 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3199 if (__kmp_hws_tile.num == 0) 3200 __kmp_hws_tile.num = NL; // use all available tiles 3201 if (__kmp_hws_tile.offset >= NL) { 3202 KMP_WARNING(AffHWSubsetManyTiles); 3203 goto _exit; 3204 } 3205 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3206 &hC); // num cores in tile 3207 if (__kmp_hws_core.num == 0) 3208 __kmp_hws_core.num = NC; // use all available cores 3209 if (__kmp_hws_core.offset >= NC) { 3210 KMP_WARNING(AffHWSubsetManyCores); 3211 goto _exit; 3212 } 3213 } else { // tile_support 3214 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3215 &hC); // num cores in socket 3216 if (__kmp_hws_core.num == 0) 3217 __kmp_hws_core.num = NC; // use all available cores 3218 if (__kmp_hws_core.offset >= NC) { 3219 KMP_WARNING(AffHWSubsetManyCores); 3220 goto _exit; 3221 } 3222 } // tile_support 3223 } 3224 if (__kmp_hws_proc.num == 0) 3225 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3226 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3227 KMP_WARNING(AffHWSubsetManyProcs); 3228 goto _exit; 3229 } 3230 // end of validation -------------------------------------------- 3231 3232 if (pAddr) // pAddr is NULL in case of affinity_none 3233 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3234 __kmp_avail_proc); // max size 3235 // main loop to form HW subset ---------------------------------- 3236 hS = NULL; 3237 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3238 for (int s = 0; s < NP; ++s) { 3239 // Check Socket ----------------------------------------------- 3240 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3241 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3242 continue; // skip socket if all PUs are out of fullMask 3243 ++nS; // only count objects those have PUs in affinity mask 3244 if (nS <= __kmp_hws_socket.offset 
|| 3245 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3246 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3247 continue; // move to next socket 3248 } 3249 nCr = 0; // count number of cores per socket 3250 // socket requested, go down the topology tree 3251 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3252 if (numa_support) { 3253 nN = 0; 3254 hN = NULL; 3255 // num nodes in current socket 3256 int NN = 3257 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN); 3258 for (int n = 0; n < NN; ++n) { 3259 // Check NUMA Node ---------------------------------------- 3260 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3261 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3262 continue; // skip node if all PUs are out of fullMask 3263 } 3264 ++nN; 3265 if (nN <= __kmp_hws_node.offset || 3266 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3267 // skip node as not requested 3268 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node 3269 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3270 continue; // move to next node 3271 } 3272 // node requested, go down the topology tree 3273 if (tile_support) { 3274 nL = 0; 3275 hL = NULL; 3276 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3277 for (int l = 0; l < NL; ++l) { 3278 // Check L2 (tile) ------------------------------------ 3279 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3280 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3281 continue; // skip tile if all PUs are out of fullMask 3282 } 3283 ++nL; 3284 if (nL <= __kmp_hws_tile.offset || 3285 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3286 // skip tile as not requested 3287 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3288 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3289 continue; // move to next tile 3290 } 3291 // tile requested, go down the topology tree 3292 nC = 0; 3293 hC = NULL; 3294 // num cores in current tile 3295 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3296 HWLOC_OBJ_CORE, &hC); 3297 for (int c = 0; c < NC; ++c) { 3298 // Check Core --------------------------------------- 3299 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3300 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3301 continue; // skip core if all PUs are out of fullMask 3302 } 3303 ++nC; 3304 if (nC <= __kmp_hws_core.offset || 3305 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3306 // skip node as not requested 3307 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3308 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3309 continue; // move to next node 3310 } 3311 // core requested, go down to PUs 3312 nT = 0; 3313 nTr = 0; 3314 hT = NULL; 3315 // num procs in current core 3316 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3317 HWLOC_OBJ_PU, &hT); 3318 for (int t = 0; t < NT; ++t) { 3319 // Check PU --------------------------------------- 3320 idx = hT->os_index; 3321 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3322 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3323 continue; // skip PU if not in fullMask 3324 } 3325 ++nT; 3326 if (nT <= __kmp_hws_proc.offset || 3327 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3328 // skip PU 3329 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3330 ++n_old; 3331 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3332 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3333 continue; // move to next node 3334 } 3335 ++nTr; 3336 if (pAddr) // collect requested thread's data 3337 newAddr[n_new] = 
(*pAddr)[n_old]; 3338 ++n_new; 3339 ++n_old; 3340 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3341 } // threads loop 3342 if (nTr > 0) { 3343 ++nCr; // num cores per socket 3344 ++nCo; // total num cores 3345 if (nTr > nTpC) 3346 nTpC = nTr; // calc max threads per core 3347 } 3348 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3349 } // cores loop 3350 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3351 } // tiles loop 3352 } else { // tile_support 3353 // no tiles, check cores 3354 nC = 0; 3355 hC = NULL; 3356 // num cores in current node 3357 int NC = 3358 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC); 3359 for (int c = 0; c < NC; ++c) { 3360 // Check Core --------------------------------------- 3361 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3362 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3363 continue; // skip core if all PUs are out of fullMask 3364 } 3365 ++nC; 3366 if (nC <= __kmp_hws_core.offset || 3367 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3368 // skip node as not requested 3369 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3370 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3371 continue; // move to next node 3372 } 3373 // core requested, go down to PUs 3374 nT = 0; 3375 nTr = 0; 3376 hT = NULL; 3377 int NT = 3378 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3379 for (int t = 0; t < NT; ++t) { 3380 // Check PU --------------------------------------- 3381 idx = hT->os_index; 3382 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3383 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3384 continue; // skip PU if not in fullMask 3385 } 3386 ++nT; 3387 if (nT <= __kmp_hws_proc.offset || 3388 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3389 // skip PU 3390 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3391 ++n_old; 3392 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3393 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3394 continue; // move to next node 3395 } 3396 ++nTr; 3397 if (pAddr) // collect requested thread's data 3398 newAddr[n_new] = (*pAddr)[n_old]; 3399 ++n_new; 3400 ++n_old; 3401 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3402 } // threads loop 3403 if (nTr > 0) { 3404 ++nCr; // num cores per socket 3405 ++nCo; // total num cores 3406 if (nTr > nTpC) 3407 nTpC = nTr; // calc max threads per core 3408 } 3409 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3410 } // cores loop 3411 } // tiles support 3412 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3413 } // nodes loop 3414 } else { // numa_support 3415 // no NUMA support 3416 if (tile_support) { 3417 nL = 0; 3418 hL = NULL; 3419 // num tiles in current socket 3420 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3421 for (int l = 0; l < NL; ++l) { 3422 // Check L2 (tile) ------------------------------------ 3423 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3424 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3425 continue; // skip tile if all PUs are out of fullMask 3426 } 3427 ++nL; 3428 if (nL <= __kmp_hws_tile.offset || 3429 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3430 // skip tile as not requested 3431 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3432 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3433 continue; // move to next tile 3434 } 3435 // tile requested, go down the topology tree 3436 nC = 0; 3437 hC = NULL; 3438 // num cores per tile 3439 int NC = 3440 __kmp_hwloc_count_children_by_type(tp, hL, 
HWLOC_OBJ_CORE, &hC); 3441 for (int c = 0; c < NC; ++c) { 3442 // Check Core --------------------------------------- 3443 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3444 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3445 continue; // skip core if all PUs are out of fullMask 3446 } 3447 ++nC; 3448 if (nC <= __kmp_hws_core.offset || 3449 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3450 // skip node as not requested 3451 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3452 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3453 continue; // move to next node 3454 } 3455 // core requested, go down to PUs 3456 nT = 0; 3457 nTr = 0; 3458 hT = NULL; 3459 // num procs per core 3460 int NT = 3461 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3462 for (int t = 0; t < NT; ++t) { 3463 // Check PU --------------------------------------- 3464 idx = hT->os_index; 3465 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3466 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3467 continue; // skip PU if not in fullMask 3468 } 3469 ++nT; 3470 if (nT <= __kmp_hws_proc.offset || 3471 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3472 // skip PU 3473 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3474 ++n_old; 3475 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3476 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3477 continue; // move to next node 3478 } 3479 ++nTr; 3480 if (pAddr) // collect requested thread's data 3481 newAddr[n_new] = (*pAddr)[n_old]; 3482 ++n_new; 3483 ++n_old; 3484 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3485 } // threads loop 3486 if (nTr > 0) { 3487 ++nCr; // num cores per socket 3488 ++nCo; // total num cores 3489 if (nTr > nTpC) 3490 nTpC = nTr; // calc max threads per core 3491 } 3492 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3493 } // cores loop 3494 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3495 } // tiles loop 3496 } else { // tile_support 3497 // no tiles, check cores 3498 nC = 0; 3499 hC = NULL; 3500 // num cores in socket 3501 int NC = 3502 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC); 3503 for (int c = 0; c < NC; ++c) { 3504 // Check Core ------------------------------------------- 3505 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3506 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3507 continue; // skip core if all PUs are out of fullMask 3508 } 3509 ++nC; 3510 if (nC <= __kmp_hws_core.offset || 3511 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3512 // skip node as not requested 3513 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3514 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3515 continue; // move to next node 3516 } 3517 // core requested, go down to PUs 3518 nT = 0; 3519 nTr = 0; 3520 hT = NULL; 3521 // num procs per core 3522 int NT = 3523 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT); 3524 for (int t = 0; t < NT; ++t) { 3525 // Check PU --------------------------------------- 3526 idx = hT->os_index; 3527 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3528 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3529 continue; // skip PU if not in fullMask 3530 } 3531 ++nT; 3532 if (nT <= __kmp_hws_proc.offset || 3533 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3534 // skip PU 3535 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3536 ++n_old; 3537 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3538 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3539 continue; // move to next node 
3540 } 3541 ++nTr; 3542 if (pAddr) // collect requested thread's data 3543 newAddr[n_new] = (*pAddr)[n_old]; 3544 ++n_new; 3545 ++n_old; 3546 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3547 } // threads loop 3548 if (nTr > 0) { 3549 ++nCr; // num cores per socket 3550 ++nCo; // total num cores 3551 if (nTr > nTpC) 3552 nTpC = nTr; // calc max threads per core 3553 } 3554 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3555 } // cores loop 3556 } // tiles support 3557 } // numa_support 3558 if (nCr > 0) { // found cores? 3559 ++nPkg; // num sockets 3560 if (nCr > nCpP) 3561 nCpP = nCr; // calc max cores per socket 3562 } 3563 } // sockets loop 3564 3565 // check the subset is valid 3566 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3567 KMP_DEBUG_ASSERT(nPkg > 0); 3568 KMP_DEBUG_ASSERT(nCpP > 0); 3569 KMP_DEBUG_ASSERT(nTpC > 0); 3570 KMP_DEBUG_ASSERT(nCo > 0); 3571 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3572 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3573 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3574 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3575 3576 nPackages = nPkg; // correct num sockets 3577 nCoresPerPkg = nCpP; // correct num cores per socket 3578 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3579 __kmp_avail_proc = n_new; // correct num procs 3580 __kmp_ncores = nCo; // correct num cores 3581 // hwloc topology method end 3582 } else 3583 #endif // KMP_USE_HWLOC 3584 { 3585 int n_old = 0, n_new = 0, proc_num = 0; 3586 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3587 KMP_WARNING(AffHWSubsetNoHWLOC); 3588 goto _exit; 3589 } 3590 if (__kmp_hws_socket.num == 0) 3591 __kmp_hws_socket.num = nPackages; // use all available sockets 3592 if (__kmp_hws_core.num == 0) 3593 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3594 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3595 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3596 if (!__kmp_affinity_uniform_topology()) { 3597 KMP_WARNING(AffHWSubsetNonUniform); 3598 goto _exit; // don't support non-uniform topology 3599 } 3600 if (depth > 3) { 3601 KMP_WARNING(AffHWSubsetNonThreeLevel); 3602 goto _exit; // don't support not-3-level topology 3603 } 3604 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3605 KMP_WARNING(AffHWSubsetManySockets); 3606 goto _exit; 3607 } 3608 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) { 3609 KMP_WARNING(AffHWSubsetManyCores); 3610 goto _exit; 3611 } 3612 // Form the requested subset 3613 if (pAddr) // pAddr is NULL in case of affinity_none 3614 newAddr = (AddrUnsPair *)__kmp_allocate( 3615 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num * 3616 __kmp_hws_proc.num); 3617 for (int i = 0; i < nPackages; ++i) { 3618 if (i < __kmp_hws_socket.offset || 3619 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3620 // skip not-requested socket 3621 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3622 if (__kmp_pu_os_idx != NULL) { 3623 // walk through skipped socket 3624 for (int j = 0; j < nCoresPerPkg; ++j) { 3625 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3626 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3627 ++proc_num; 3628 } 3629 } 3630 } 3631 } else { 3632 // walk through requested socket 3633 for (int j = 0; j < nCoresPerPkg; ++j) { 3634 if (j < __kmp_hws_core.offset || 3635 j >= __kmp_hws_core.offset + 3636 __kmp_hws_core.num) { // skip not-requested core 3637 n_old += __kmp_nThreadsPerCore; 3638 if (__kmp_pu_os_idx != NULL) { 3639 for 
(int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3640 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3641 ++proc_num; 3642 } 3643 } 3644 } else { 3645 // walk through requested core 3646 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3647 if (k < __kmp_hws_proc.num) { 3648 if (pAddr) // collect requested thread's data 3649 newAddr[n_new] = (*pAddr)[n_old]; 3650 n_new++; 3651 } else { 3652 if (__kmp_pu_os_idx != NULL) 3653 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3654 } 3655 n_old++; 3656 ++proc_num; 3657 } 3658 } 3659 } 3660 } 3661 } 3662 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3663 KMP_DEBUG_ASSERT(n_new == 3664 __kmp_hws_socket.num * __kmp_hws_core.num * 3665 __kmp_hws_proc.num); 3666 nPackages = __kmp_hws_socket.num; // correct nPackages 3667 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3668 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3669 __kmp_avail_proc = n_new; // correct avail_proc 3670 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3671 } // non-hwloc topology method 3672 if (pAddr) { 3673 __kmp_free(*pAddr); 3674 *pAddr = newAddr; // replace old topology with new one 3675 } 3676 if (__kmp_affinity_verbose) { 3677 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3678 __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN, 3679 __kmp_affin_fullMask); 3680 if (__kmp_affinity_respect_mask) { 3681 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3682 } else { 3683 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3684 } 3685 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3686 kmp_str_buf_t buf; 3687 __kmp_str_buf_init(&buf); 3688 __kmp_str_buf_print(&buf, "%d", nPackages); 3689 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3690 __kmp_nThreadsPerCore, __kmp_ncores); 3691 __kmp_str_buf_free(&buf); 3692 } 3693 _exit: 3694 if (__kmp_pu_os_idx != NULL) { 3695 __kmp_free(__kmp_pu_os_idx); 3696 __kmp_pu_os_idx = NULL; 3697 } 3698 } 3699 3700 // This function figures out the deepest level at which there is at least one 3701 // cluster/core with more than one processing unit bound to it. 3702 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3703 int nprocs, int bottom_level) { 3704 int core_level = 0; 3705 3706 for (int i = 0; i < nprocs; i++) { 3707 for (int j = bottom_level; j > 0; j--) { 3708 if (address2os[i].first.labels[j] > 0) { 3709 if (core_level < (j - 1)) { 3710 core_level = j - 1; 3711 } 3712 } 3713 } 3714 } 3715 return core_level; 3716 } 3717 3718 // This function counts number of clusters/cores at given level. 3719 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3720 int nprocs, int bottom_level, 3721 int core_level) { 3722 int ncores = 0; 3723 int i, j; 3724 3725 j = bottom_level; 3726 for (i = 0; i < nprocs; i++) { 3727 for (j = bottom_level; j > core_level; j--) { 3728 if ((i + 1) < nprocs) { 3729 if (address2os[i + 1].first.labels[j] > 0) { 3730 break; 3731 } 3732 } 3733 } 3734 if (j == core_level) { 3735 ncores++; 3736 } 3737 } 3738 if (j > core_level) { 3739 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 3740 // core. May occur when called from __kmp_affinity_find_core(). 3741 ncores++; 3742 } 3743 return ncores; 3744 } 3745 3746 // This function finds to which cluster/core given processing unit is bound. 
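// For illustration only (hypothetical table, not from this source): on a
// machine with 2 cores and 2 hardware threads per core, bottom_level == 2 and
// core_level == 1, a sorted address2os table carries the label tuples
//   {0,0,0}, {0,0,1}, {0,1,0}, {0,1,1}.
// __kmp_affinity_compute_ncores(address2os, proc + 1, ...) counts how many
// distinct cores appear in the first proc + 1 entries, so the helper below
// returns the zero-based core index of entry proc: 0, 0, 1, 1 for proc = 0..3.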
static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
                                    int bottom_level, int core_level) {
  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
                                       core_level) -
         1;
}

// This function finds the maximal number of processing units bound to a
// cluster/core at the given level.
static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
                                            int nprocs, int bottom_level,
                                            int core_level) {
  int maxprocpercore = 0;

  if (core_level < bottom_level) {
    for (int i = 0; i < nprocs; i++) {
      int percore = address2os[i].first.labels[core_level + 1] + 1;

      if (percore > maxprocpercore) {
        maxprocpercore = percore;
      }
    }
  } else {
    maxprocpercore = 1;
  }
  return maxprocpercore;
}

static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;

#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                           \
  KMP_ASSERT(address2os == NULL);                                             \
  __kmp_apply_thread_places(NULL, 0);                                         \
  return;

static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
  const Address *aa = &(((const AddrUnsPair *)a)->first);
  const Address *bb = &(((const AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  unsigned i;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  return 0;
}

static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors
  // that we know about on the machine.
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    if (__kmp_affinity_respect_mask) {
      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);

      // Count the number of available processors.
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }
    } else {
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
    }
  }

  int depth = -1;
  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or the
    // verbose flag is set, and the nowarnings flag was not set.
    const char *file_name = NULL;
    int line = 0;
#if KMP_USE_HWLOC
    if (depth < 0 &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      }
      if (!__kmp_hwloc_error) {
        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        } else if (depth < 0 && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
      }

      file_name = NULL;
      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }

      if (depth < 0) {
        if (__kmp_affinity_verbose) {
          if (msg_id != kmp_i18n_null) {
            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
                       __kmp_i18n_catgets(msg_id),
                       KMP_I18N_STR(DecodingLegacyAPIC));
          } else {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
                       KMP_I18N_STR(DecodingLegacyAPIC));
          }
        }

        file_name = NULL;
        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        if (msg_id != kmp_i18n_null) {
          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
        } else {
          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
        }
      }

      FILE *f = fopen("/proc/cpuinfo", "r");
      if (f == NULL) {
        msg_id = kmp_i18n_str_CantOpenCpuinfo;
      } else {
        file_name = "/proc/cpuinfo";
        depth =
            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY

    if ((depth < 0) &&
        (__kmp_num_proc_groups > 1)) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
      }

      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
      KMP_ASSERT(depth != 0);
    }

#endif /* KMP_GROUP_AFFINITY */

    if (depth < 0) {
      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
        if (file_name == NULL) {
          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
        } else if (line == 0) {
          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
        } else {
          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
                     __kmp_i18n_catgets(msg_id));
        }
      }
      // FIXME - print msg if msg_id = kmp_i18n_null ???

      file_name = "";
      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }
      KMP_ASSERT(depth > 0);
      KMP_ASSERT(address2os != NULL);
    }
  }

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
    }

    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
    }

    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    const char *filename;
    if (__kmp_cpuinfo_file != NULL) {
      filename = __kmp_cpuinfo_file;
    } else {
      filename = "/proc/cpuinfo";
    }

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
    }

    FILE *f = fopen(filename, "r");
    if (f == NULL) {
      int code = errno;
      if (__kmp_cpuinfo_file != NULL) {
        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
      } else {
        __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
                    __kmp_msg_null);
      }
    }
    int line = 0;
    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
    fclose(f);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
    if (__kmp_affinity_type == affinity_none) {
      KMP_ASSERT(depth == 0);
      KMP_EXIT_AFF_NONE;
    }
  }

#if KMP_GROUP_AFFINITY

  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    if
(__kmp_affinity_verbose) { 4067 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4068 } 4069 4070 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4071 KMP_ASSERT(depth != 0); 4072 if (depth < 0) { 4073 KMP_ASSERT(msg_id != kmp_i18n_null); 4074 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4075 } 4076 } 4077 4078 #endif /* KMP_GROUP_AFFINITY */ 4079 4080 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4081 if (__kmp_affinity_verbose) { 4082 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4083 } 4084 4085 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4086 if (depth == 0) { 4087 KMP_EXIT_AFF_NONE; 4088 } 4089 // should not fail 4090 KMP_ASSERT(depth > 0); 4091 KMP_ASSERT(address2os != NULL); 4092 } 4093 4094 #if KMP_USE_HWLOC 4095 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4096 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4097 if (__kmp_affinity_verbose) { 4098 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4099 } 4100 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4101 if (depth == 0) { 4102 KMP_EXIT_AFF_NONE; 4103 } 4104 } 4105 #endif // KMP_USE_HWLOC 4106 4107 if (address2os == NULL) { 4108 if (KMP_AFFINITY_CAPABLE() && 4109 (__kmp_affinity_verbose || 4110 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4111 KMP_WARNING(ErrorInitializeAffinity); 4112 } 4113 __kmp_affinity_type = affinity_none; 4114 KMP_AFFINITY_DISABLE(); 4115 return; 4116 } 4117 4118 __kmp_apply_thread_places(&address2os, depth); 4119 4120 // Create the table of masks, indexed by thread Id. 4121 unsigned maxIndex; 4122 unsigned numUnique; 4123 kmp_affin_mask_t *osId2Mask = 4124 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4125 if (__kmp_affinity_gran_levels == 0) { 4126 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4127 } 4128 4129 // Set the childNums vector in all Address objects. This must be done before 4130 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4131 // account the setting of __kmp_affinity_compact. 4132 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4133 4134 switch (__kmp_affinity_type) { 4135 4136 case affinity_explicit: 4137 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4138 #if OMP_40_ENABLED 4139 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4140 #endif 4141 { 4142 __kmp_affinity_process_proclist( 4143 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4144 __kmp_affinity_proclist, osId2Mask, maxIndex); 4145 } 4146 #if OMP_40_ENABLED 4147 else { 4148 __kmp_affinity_process_placelist( 4149 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4150 __kmp_affinity_proclist, osId2Mask, maxIndex); 4151 } 4152 #endif 4153 if (__kmp_affinity_num_masks == 0) { 4154 if (__kmp_affinity_verbose || 4155 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4156 KMP_WARNING(AffNoValidProcID); 4157 } 4158 __kmp_affinity_type = affinity_none; 4159 return; 4160 } 4161 break; 4162 4163 // The other affinity types rely on sorting the Addresses according to some 4164 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4165 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4166 // to do the sort and create the array of affinity masks. 
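  // For illustration only (hypothetical machine, not from this source):
  // __kmp_affinity_cmp_Address_child_num() compares the deepest
  // __kmp_affinity_compact levels first and the remaining outer levels after
  // them. On a 2-package x 2-core x 2-thread machine (depth == 3), "compact"
  // with the default permute level leaves __kmp_affinity_compact == 0, so the
  // sort keys are (package, core, thread) and neighbouring places share a
  // core; "scatter" rewrites it to depth - 1 == 2, so the keys become
  // (thread, core, package) and neighbouring places land on different
  // packages first.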
4167 4168 case affinity_logical: 4169 __kmp_affinity_compact = 0; 4170 if (__kmp_affinity_offset) { 4171 __kmp_affinity_offset = 4172 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4173 } 4174 goto sortAddresses; 4175 4176 case affinity_physical: 4177 if (__kmp_nThreadsPerCore > 1) { 4178 __kmp_affinity_compact = 1; 4179 if (__kmp_affinity_compact >= depth) { 4180 __kmp_affinity_compact = 0; 4181 } 4182 } else { 4183 __kmp_affinity_compact = 0; 4184 } 4185 if (__kmp_affinity_offset) { 4186 __kmp_affinity_offset = 4187 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4188 } 4189 goto sortAddresses; 4190 4191 case affinity_scatter: 4192 if (__kmp_affinity_compact >= depth) { 4193 __kmp_affinity_compact = 0; 4194 } else { 4195 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4196 } 4197 goto sortAddresses; 4198 4199 case affinity_compact: 4200 if (__kmp_affinity_compact >= depth) { 4201 __kmp_affinity_compact = depth - 1; 4202 } 4203 goto sortAddresses; 4204 4205 case affinity_balanced: 4206 if (depth <= 1) { 4207 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4208 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4209 } 4210 __kmp_affinity_type = affinity_none; 4211 return; 4212 } else if (__kmp_affinity_uniform_topology()) { 4213 break; 4214 } else { // Non-uniform topology 4215 4216 // Save the depth for further usage 4217 __kmp_aff_depth = depth; 4218 4219 int core_level = __kmp_affinity_find_core_level( 4220 address2os, __kmp_avail_proc, depth - 1); 4221 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4222 depth - 1, core_level); 4223 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4224 address2os, __kmp_avail_proc, depth - 1, core_level); 4225 4226 int nproc = ncores * maxprocpercore; 4227 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4228 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4229 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4230 } 4231 __kmp_affinity_type = affinity_none; 4232 return; 4233 } 4234 4235 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4236 for (int i = 0; i < nproc; i++) { 4237 procarr[i] = -1; 4238 } 4239 4240 int lastcore = -1; 4241 int inlastcore = 0; 4242 for (int i = 0; i < __kmp_avail_proc; i++) { 4243 int proc = address2os[i].second; 4244 int core = 4245 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4246 4247 if (core == lastcore) { 4248 inlastcore++; 4249 } else { 4250 inlastcore = 0; 4251 } 4252 lastcore = core; 4253 4254 procarr[core * maxprocpercore + inlastcore] = proc; 4255 } 4256 4257 break; 4258 } 4259 4260 sortAddresses: 4261 // Allocate the gtid->affinity mask table. 4262 if (__kmp_affinity_dups) { 4263 __kmp_affinity_num_masks = __kmp_avail_proc; 4264 } else { 4265 __kmp_affinity_num_masks = numUnique; 4266 } 4267 4268 #if OMP_40_ENABLED 4269 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4270 (__kmp_affinity_num_places > 0) && 4271 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4272 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4273 } 4274 #endif 4275 4276 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4277 4278 // Sort the address2os table according to the current setting of 4279 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
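    // Note (explanatory, not part of the original source): when
    // __kmp_affinity_dups is false, __kmp_affinity_num_masks was set to
    // numUnique above, and the copy loop below keeps only the "leader" entry
    // of each group of address2os entries that share a mask at the current
    // granularity; when it is true, every entry contributes its own place.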
    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
    {
      int i;
      unsigned j;
      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
          continue;
        }
        unsigned osId = address2os[i].second;
        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
  machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none in
  // this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
#if OMP_40_ENABLED
  __kmp_affinity_num_places = 0;
#endif
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (address2os != NULL) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  KMPAffinity::destroy_api();
}

void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
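  // For illustration only (hypothetical numbers, not from this source): when a
  // specific affinity type is active, the place index chosen below is
  //   i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
  // so with __kmp_affinity_num_masks == 4 and __kmp_affinity_offset == 1,
  // threads gtid = 0..5 would be bound to places 1, 2, 3, 0, 1, 2.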
4384 kmp_affin_mask_t *mask; 4385 int i; 4386 4387 #if OMP_40_ENABLED 4388 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4389 #endif 4390 { 4391 if ((__kmp_affinity_type == affinity_none) || 4392 (__kmp_affinity_type == affinity_balanced)) { 4393 #if KMP_GROUP_AFFINITY 4394 if (__kmp_num_proc_groups > 1) { 4395 return; 4396 } 4397 #endif 4398 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4399 i = KMP_PLACE_ALL; 4400 mask = __kmp_affin_fullMask; 4401 } else { 4402 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4403 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4404 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4405 } 4406 } 4407 #if OMP_40_ENABLED 4408 else { 4409 if ((!isa_root) || 4410 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4411 #if KMP_GROUP_AFFINITY 4412 if (__kmp_num_proc_groups > 1) { 4413 return; 4414 } 4415 #endif 4416 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4417 i = KMP_PLACE_ALL; 4418 mask = __kmp_affin_fullMask; 4419 } else { 4420 // int i = some hash function or just a counter that doesn't 4421 // always start at 0. Use gtid for now. 4422 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4423 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4424 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4425 } 4426 } 4427 #endif 4428 4429 #if OMP_40_ENABLED 4430 th->th.th_current_place = i; 4431 if (isa_root) { 4432 th->th.th_new_place = i; 4433 th->th.th_first_place = 0; 4434 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4435 } 4436 4437 if (i == KMP_PLACE_ALL) { 4438 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4439 gtid)); 4440 } else { 4441 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4442 gtid, i)); 4443 } 4444 #else 4445 if (i == -1) { 4446 KA_TRACE( 4447 100, 4448 ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4449 gtid)); 4450 } else { 4451 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4452 gtid, i)); 4453 } 4454 #endif /* OMP_40_ENABLED */ 4455 4456 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4457 4458 if (__kmp_affinity_verbose) { 4459 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4460 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4461 th->th.th_affin_mask); 4462 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4463 __kmp_gettid(), gtid, buf); 4464 } 4465 4466 #if KMP_OS_WINDOWS 4467 // On Windows* OS, the process affinity mask might have changed. If the user 4468 // didn't request affinity and this call fails, just continue silently. 4469 // See CQ171393. 4470 if (__kmp_affinity_type == affinity_none) { 4471 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4472 } else 4473 #endif 4474 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4475 } 4476 4477 #if OMP_40_ENABLED 4478 4479 void __kmp_affinity_set_place(int gtid) { 4480 int retval; 4481 4482 if (!KMP_AFFINITY_CAPABLE()) { 4483 return; 4484 } 4485 4486 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4487 4488 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4489 "place = %d)\n", 4490 gtid, th->th.th_new_place, th->th.th_current_place)); 4491 4492 // Check that the new place is within this thread's partition. 
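  // For illustration only (hypothetical example, not from this source): the
  // place partition [th_first_place, th_last_place] may wrap around the end
  // of the place list. With 8 places, first_place = 6 and last_place = 1
  // denotes the partition {6, 7, 0, 1}; the assertions below are therefore
  // written to tolerate both the ordinary and the wrapped form of the
  // interval.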
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

#endif /* OMP_40_ENABLED */

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(1000, ; {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf(
        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
        buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

#if OMP_40_ENABLED
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
4584 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4585 #endif 4586 4587 return retval; 4588 } 4589 4590 int __kmp_aux_get_affinity(void **mask) { 4591 int gtid; 4592 int retval; 4593 kmp_info_t *th; 4594 4595 if (!KMP_AFFINITY_CAPABLE()) { 4596 return -1; 4597 } 4598 4599 gtid = __kmp_entry_gtid(); 4600 th = __kmp_threads[gtid]; 4601 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4602 4603 KA_TRACE(1000, ; { 4604 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4605 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4606 th->th.th_affin_mask); 4607 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4608 gtid, buf); 4609 }); 4610 4611 if (__kmp_env_consistency_check) { 4612 if ((mask == NULL) || (*mask == NULL)) { 4613 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4614 } 4615 } 4616 4617 #if !KMP_OS_WINDOWS 4618 4619 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4620 KA_TRACE(1000, ; { 4621 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4622 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4623 (kmp_affin_mask_t *)(*mask)); 4624 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4625 gtid, buf); 4626 }); 4627 return retval; 4628 4629 #else 4630 4631 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4632 return 0; 4633 4634 #endif /* KMP_OS_WINDOWS */ 4635 } 4636 4637 int __kmp_aux_get_affinity_max_proc() { 4638 if (!KMP_AFFINITY_CAPABLE()) { 4639 return 0; 4640 } 4641 #if KMP_GROUP_AFFINITY 4642 if (__kmp_num_proc_groups > 1) { 4643 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4644 } 4645 #endif 4646 return __kmp_xproc; 4647 } 4648 4649 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4650 int retval; 4651 4652 if (!KMP_AFFINITY_CAPABLE()) { 4653 return -1; 4654 } 4655 4656 KA_TRACE(1000, ; { 4657 int gtid = __kmp_entry_gtid(); 4658 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4659 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4660 (kmp_affin_mask_t *)(*mask)); 4661 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4662 "affinity mask for thread %d = %s\n", 4663 proc, gtid, buf); 4664 }); 4665 4666 if (__kmp_env_consistency_check) { 4667 if ((mask == NULL) || (*mask == NULL)) { 4668 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4669 } 4670 } 4671 4672 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4673 return -1; 4674 } 4675 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4676 return -2; 4677 } 4678 4679 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4680 return 0; 4681 } 4682 4683 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4684 int retval; 4685 4686 if (!KMP_AFFINITY_CAPABLE()) { 4687 return -1; 4688 } 4689 4690 KA_TRACE(1000, ; { 4691 int gtid = __kmp_entry_gtid(); 4692 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4693 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4694 (kmp_affin_mask_t *)(*mask)); 4695 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4696 "affinity mask for thread %d = %s\n", 4697 proc, gtid, buf); 4698 }); 4699 4700 if (__kmp_env_consistency_check) { 4701 if ((mask == NULL) || (*mask == NULL)) { 4702 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4703 } 4704 } 4705 4706 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4707 return -1; 4708 } 4709 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4710 return -2; 4711 } 4712 4713 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4714 return 0; 4715 } 
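// For illustration only (not part of this file): the __kmp_aux_* routines
// above and below back the public kmp_* affinity entry points declared in
// omp.h / kmp.h. Assuming that API, a typical user-level sequence would be:
//
//   kmp_affinity_mask_t mask;
//   kmp_create_affinity_mask(&mask);
//   kmp_set_affinity_mask_proc(3, &mask);  // 0 on success, -1/-2 on error
//   if (kmp_set_affinity(&mask) != 0) {
//     // handle the error
//   }
//   kmp_destroy_affinity_mask(&mask);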
4716 4717 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4718 int retval; 4719 4720 if (!KMP_AFFINITY_CAPABLE()) { 4721 return -1; 4722 } 4723 4724 KA_TRACE(1000, ; { 4725 int gtid = __kmp_entry_gtid(); 4726 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4727 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4728 (kmp_affin_mask_t *)(*mask)); 4729 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4730 "affinity mask for thread %d = %s\n", 4731 proc, gtid, buf); 4732 }); 4733 4734 if (__kmp_env_consistency_check) { 4735 if ((mask == NULL) || (*mask == NULL)) { 4736 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4737 } 4738 } 4739 4740 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4741 return -1; 4742 } 4743 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4744 return 0; 4745 } 4746 4747 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4748 } 4749 4750 // Dynamic affinity settings - Affinity balanced 4751 void __kmp_balanced_affinity(int tid, int nthreads) { 4752 bool fine_gran = true; 4753 4754 switch (__kmp_affinity_gran) { 4755 case affinity_gran_fine: 4756 case affinity_gran_thread: 4757 break; 4758 case affinity_gran_core: 4759 if (__kmp_nThreadsPerCore > 1) { 4760 fine_gran = false; 4761 } 4762 break; 4763 case affinity_gran_package: 4764 if (nCoresPerPkg > 1) { 4765 fine_gran = false; 4766 } 4767 break; 4768 default: 4769 fine_gran = false; 4770 } 4771 4772 if (__kmp_affinity_uniform_topology()) { 4773 int coreID; 4774 int threadID; 4775 // Number of hyper threads per core in HT machine 4776 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4777 // Number of cores 4778 int ncores = __kmp_ncores; 4779 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4780 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4781 ncores = nPackages; 4782 } 4783 // How many threads will be bound to each core 4784 int chunk = nthreads / ncores; 4785 // How many cores will have an additional thread bound to it - "big cores" 4786 int big_cores = nthreads % ncores; 4787 // Number of threads on the big cores 4788 int big_nth = (chunk + 1) * big_cores; 4789 if (tid < big_nth) { 4790 coreID = tid / (chunk + 1); 4791 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4792 } else { // tid >= big_nth 4793 coreID = (tid - big_cores) / chunk; 4794 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4795 } 4796 4797 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4798 "Illegal set affinity operation when not capable"); 4799 4800 kmp_affin_mask_t *mask; 4801 KMP_CPU_ALLOC_ON_STACK(mask); 4802 KMP_CPU_ZERO(mask); 4803 4804 if (fine_gran) { 4805 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 4806 KMP_CPU_SET(osID, mask); 4807 } else { 4808 for (int i = 0; i < __kmp_nth_per_core; i++) { 4809 int osID; 4810 osID = address2os[coreID * __kmp_nth_per_core + i].second; 4811 KMP_CPU_SET(osID, mask); 4812 } 4813 } 4814 if (__kmp_affinity_verbose) { 4815 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4816 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4817 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4818 __kmp_gettid(), tid, buf); 4819 } 4820 __kmp_set_system_affinity(mask, TRUE); 4821 KMP_CPU_FREE_FROM_STACK(mask); 4822 } else { // Non-uniform topology 4823 4824 kmp_affin_mask_t *mask; 4825 KMP_CPU_ALLOC_ON_STACK(mask); 4826 KMP_CPU_ZERO(mask); 4827 4828 int core_level = __kmp_affinity_find_core_level( 4829 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 4830 int ncores = 
__kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4831 __kmp_aff_depth - 1, core_level); 4832 int nth_per_core = __kmp_affinity_max_proc_per_core( 4833 address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4834 4835 // For performance gain consider the special case nthreads == 4836 // __kmp_avail_proc 4837 if (nthreads == __kmp_avail_proc) { 4838 if (fine_gran) { 4839 int osID = address2os[tid].second; 4840 KMP_CPU_SET(osID, mask); 4841 } else { 4842 int core = __kmp_affinity_find_core(address2os, tid, 4843 __kmp_aff_depth - 1, core_level); 4844 for (int i = 0; i < __kmp_avail_proc; i++) { 4845 int osID = address2os[i].second; 4846 if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, 4847 core_level) == core) { 4848 KMP_CPU_SET(osID, mask); 4849 } 4850 } 4851 } 4852 } else if (nthreads <= ncores) { 4853 4854 int core = 0; 4855 for (int i = 0; i < ncores; i++) { 4856 // Check if this core from procarr[] is in the mask 4857 int in_mask = 0; 4858 for (int j = 0; j < nth_per_core; j++) { 4859 if (procarr[i * nth_per_core + j] != -1) { 4860 in_mask = 1; 4861 break; 4862 } 4863 } 4864 if (in_mask) { 4865 if (tid == core) { 4866 for (int j = 0; j < nth_per_core; j++) { 4867 int osID = procarr[i * nth_per_core + j]; 4868 if (osID != -1) { 4869 KMP_CPU_SET(osID, mask); 4870 // For fine granularity it is enough to set the first available 4871 // osID for this core 4872 if (fine_gran) { 4873 break; 4874 } 4875 } 4876 } 4877 break; 4878 } else { 4879 core++; 4880 } 4881 } 4882 } 4883 } else { // nthreads > ncores 4884 // Array to save the number of processors at each core 4885 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4886 // Array to save the number of cores with "x" available processors; 4887 int *ncores_with_x_procs = 4888 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4889 // Array to save the number of cores with # procs from x to nth_per_core 4890 int *ncores_with_x_to_max_procs = 4891 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4892 4893 for (int i = 0; i <= nth_per_core; i++) { 4894 ncores_with_x_procs[i] = 0; 4895 ncores_with_x_to_max_procs[i] = 0; 4896 } 4897 4898 for (int i = 0; i < ncores; i++) { 4899 int cnt = 0; 4900 for (int j = 0; j < nth_per_core; j++) { 4901 if (procarr[i * nth_per_core + j] != -1) { 4902 cnt++; 4903 } 4904 } 4905 nproc_at_core[i] = cnt; 4906 ncores_with_x_procs[cnt]++; 4907 } 4908 4909 for (int i = 0; i <= nth_per_core; i++) { 4910 for (int j = i; j <= nth_per_core; j++) { 4911 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4912 } 4913 } 4914 4915 // Max number of processors 4916 int nproc = nth_per_core * ncores; 4917 // An array to keep number of threads per each context 4918 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4919 for (int i = 0; i < nproc; i++) { 4920 newarr[i] = 0; 4921 } 4922 4923 int nth = nthreads; 4924 int flag = 0; 4925 while (nth > 0) { 4926 for (int j = 1; j <= nth_per_core; j++) { 4927 int cnt = ncores_with_x_to_max_procs[j]; 4928 for (int i = 0; i < ncores; i++) { 4929 // Skip the core with 0 processors 4930 if (nproc_at_core[i] == 0) { 4931 continue; 4932 } 4933 for (int k = 0; k < nth_per_core; k++) { 4934 if (procarr[i * nth_per_core + k] != -1) { 4935 if (newarr[i * nth_per_core + k] == 0) { 4936 newarr[i * nth_per_core + k] = 1; 4937 cnt--; 4938 nth--; 4939 break; 4940 } else { 4941 if (flag != 0) { 4942 newarr[i * nth_per_core + k]++; 4943 cnt--; 4944 nth--; 4945 break; 4946 } 4947 } 4948 } 4949 } 4950 if (cnt == 0 || nth == 0) { 4951 break; 4952 } 
4953 } 4954 if (nth == 0) { 4955 break; 4956 } 4957 } 4958 flag = 1; 4959 } 4960 int sum = 0; 4961 for (int i = 0; i < nproc; i++) { 4962 sum += newarr[i]; 4963 if (sum > tid) { 4964 if (fine_gran) { 4965 int osID = procarr[i]; 4966 KMP_CPU_SET(osID, mask); 4967 } else { 4968 int coreID = i / nth_per_core; 4969 for (int ii = 0; ii < nth_per_core; ii++) { 4970 int osID = procarr[coreID * nth_per_core + ii]; 4971 if (osID != -1) { 4972 KMP_CPU_SET(osID, mask); 4973 } 4974 } 4975 } 4976 break; 4977 } 4978 } 4979 __kmp_free(newarr); 4980 } 4981 4982 if (__kmp_affinity_verbose) { 4983 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4984 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4985 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4986 __kmp_gettid(), tid, buf); 4987 } 4988 __kmp_set_system_affinity(mask, TRUE); 4989 KMP_CPU_FREE_FROM_STACK(mask); 4990 } 4991 } 4992 4993 #if KMP_OS_LINUX 4994 // We don't need this entry for Windows because 4995 // there is GetProcessAffinityMask() api 4996 // 4997 // The intended usage is indicated by these steps: 4998 // 1) The user gets the current affinity mask 4999 // 2) Then sets the affinity by calling this function 5000 // 3) Error check the return value 5001 // 4) Use non-OpenMP parallelization 5002 // 5) Reset the affinity to what was stored in step 1) 5003 #ifdef __cplusplus 5004 extern "C" 5005 #endif 5006 int 5007 kmp_set_thread_affinity_mask_initial() 5008 // the function returns 0 on success, 5009 // -1 if we cannot bind thread 5010 // >0 (errno) if an error happened during binding 5011 { 5012 int gtid = __kmp_get_gtid(); 5013 if (gtid < 0) { 5014 // Do not touch non-omp threads 5015 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5016 "non-omp thread, returning\n")); 5017 return -1; 5018 } 5019 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5020 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5021 "affinity not initialized, returning\n")); 5022 return -1; 5023 } 5024 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5025 "set full mask for thread %d\n", 5026 gtid)); 5027 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5028 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5029 } 5030 #endif 5031 5032 #endif // KMP_AFFINITY_SUPPORTED 5033
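// For illustration only (hypothetical helper names, not part of this file):
// the usage steps 1)-5) described above kmp_set_thread_affinity_mask_initial()
// could look like this on Linux:
//
//   cpu_set_t saved;
//   sched_getaffinity(0, sizeof(saved), &saved);       // 1) save current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0) {  // 2)-3) widen + check
//     run_non_openmp_parallel_work();                    // 4) hypothetical work
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);        // 5) restore the mask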