/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need
  // to init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

// Print the affinity mask to the character array in a pretty format.
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf_len >= 40);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Find first element / check for empty set.
  size_t i;
  i = mask->begin();
  if (i == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    while (*scan != '\0')
      scan++;
    KMP_ASSERT(scan <= end);
    return buf;
  }

  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
  while (*scan != '\0')
    scan++;
  i++;
  for (; i != mask->end(); i = mask->next(i)) {
    if (!KMP_CPU_ISSET(i, mask)) {
      continue;
    }

    // Check for buffer overflow. A string of the form ",<n>" will have at most
    // 10 characters, plus we want to leave room to print ",...}" if the set is
    // too large to print, for a total of 15 characters. We already left room
    // for '\0' in setting end.
    if (end - scan < 15) {
      break;
    }
    KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
    while (*scan != '\0')
      scan++;
  }
  if (i != mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, ",...");
    while (*scan != '\0')
      scan++;
  }
  KMP_SNPRINTF(scan, end - scan + 1, "}");
  while (*scan != '\0')
    scan++;
  KMP_ASSERT(scan <= end);
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc ==
         (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when
// there is one thread context per core: we don't want the extra thread-context
// level if it offers no unique labels, so it is removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
                                                  int nActiveThreads, int depth,
                                                  int *pkgLevel, int *coreLevel,
                                                  int *threadLevel) {
  int level;
  int i;
  int radix1_detected;

  for (level = depth - 1; level >= 0; --level) {
    // Always keep the package level
    if (level == *pkgLevel)
      continue;
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nActiveThreads; ++i) {
      if (address2os[0].first.labels[level] !=
          address2os[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    if (level == *threadLevel) {
      // If only one thread per core, then just decrement
      // the depth which removes the threadlevel from address2os
      for (i = 0; i < nActiveThreads; ++i) {
        address2os[i].first.depth--;
      }
      *threadLevel = -1;
    } else if (level == *coreLevel) {
      // For core level, we move the thread labels over if they are still
      // valid (*threadLevel != -1), and also reduce the depth another level
      for (i = 0; i < nActiveThreads; ++i) {
        if (*threadLevel != -1) {
          address2os[i].first.labels[*coreLevel] =
              address2os[i].first.labels[*threadLevel];
        }
        address2os[i].first.depth--;
      }
      *coreLevel = -1;
    }
  }
  return address2os[0].first.depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the PACKAGE
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type,
                                      first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  int depth = 3;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
        HWLOC_OBJ_CORE);
    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
        HWLOC_OBJ_PU);
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.
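
  // The loop nest below walks the hwloc tree socket -> core -> PU and records
  // one Address per accessible PU, using the per-level ordinal counters as
  // labels. Illustrative example (hypothetical machine, all PUs accessible):
  // 2 sockets x 2 cores x 2 PUs yields addresses
  //   {0,0,0} {0,0,1} {0,1,0} {0,1,1} {1,0,0} {1,0,1} {1,1,0} {1,1,1},
  // each paired with that PU's OS index.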

  hwloc_obj_t pu;
  hwloc_obj_t core;
  hwloc_obj_t socket;
  int nActiveThreads = 0;
  int socket_identifier = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  for (socket =
           hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
       socket != NULL;
       socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology,
                                           HWLOC_OBJ_PACKAGE, socket),
      socket_identifier++) {
    int core_identifier = 0;
    int num_active_cores = 0;
    for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
                                            socket->logical_index,
                                            HWLOC_OBJ_CORE, 0);
         core != NULL &&
         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
                                        core) == socket;
         core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology,
                                           HWLOC_OBJ_CORE, core),
        core_identifier++) {
      int pu_identifier = 0;
      int num_active_threads = 0;
      for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
                                            core->logical_index, HWLOC_OBJ_PU,
                                            0);
           pu != NULL &&
           hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
                                          pu) == core;
           pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
                                           pu),
          pu_identifier++) {
        Address addr(3);
        if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
          continue; // skip inactive (inaccessible) unit
        KA_TRACE(20,
                 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                  socket->os_index, socket->logical_index, core->os_index,
                  core->logical_index, pu->os_index, pu->logical_index));
        addr.labels[0] = socket_identifier; // package
        addr.labels[1] = core_identifier; // core
        addr.labels[2] = pu_identifier; // pu
        retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
        __kmp_pu_os_idx[nActiveThreads] =
            pu->os_index; // keep os index for each active pu
        nActiveThreads++;
        ++num_active_threads; // count active threads per core
      }
      if (num_active_threads) { // were there any active threads on the core?
        ++__kmp_ncores; // count total active cores
        ++num_active_cores; // count active cores per socket
        if (num_active_threads > __kmp_nThreadsPerCore)
          __kmp_nThreadsPerCore = num_active_threads; // calc maximum
      }
    }
    if (num_active_cores) { // were there any active cores on the socket?
      ++nPackages; // count total active packages
      if (num_active_cores > nCoresPerPkg)
        nCoresPerPkg = num_active_cores; // calc maximum
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  unsigned uniform =
      (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", nPackages);
    // for (level = 1; level <= pkgLevel; level++) {
    //   __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    // }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(
      retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
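    // __kmp_affinity_gran_levels counts how many of the finest levels kept in
    // the map threads are allowed to float across. Illustrative example
    // (hypothetical settings, assuming the usual thread < core < package
    // ordering of the granularity values): with both thread and core levels
    // present and granularity=core requested, only the thread-level test below
    // succeeds, so __kmp_affinity_gran_levels ends up as 1 and threads may
    // float across the hardware threads of a core.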
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  unsigned int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
                                                   const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->osId < bb->osId)
    return -1;
  if (aa->osId > bb->osId)
    return 1;
  return 0;
}

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  int rc;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will
    // be 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained
    // locally.
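    // Illustrative decomposition (hypothetical values): with
    // maxThreadsPerPkg = 8 and maxCoresPerPkg = 4, the mask widths computed
    // below are widthCT = 3, widthC = 2 and widthT = 1, so an apicId of 0x1d
    // (11101b) splits into pkgId = 0x1d >> 3 = 3,
    // coreId = (0x1d >> 1) & 0x3 = 2 and threadId = 0x1d & 0x1 = 1.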
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned
  // among the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for all threads bound to a given package had better
  // agree on maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars,
      // though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
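  // For reference: cpuid(11, level) reports, per sub-leaf, the level type in
  // bits 8:15 of ecx (1 = SMT, 2 = core), the number of logical processors at
  // that level in bits 0:15 of ebx, the right-shift needed to strip this
  // level's field from the x2APIC id in bits 0:4 of eax, and the full x2APIC
  // id of the current logical processor in edx. A sub-leaf with ebx == 0 marks
  // the end of the enumeration; these are exactly the fields used below.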
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try
  // to get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but
      // so far, the only machine we have seen which does not exit this loop
      // before iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
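    // Illustrative estimate (hypothetical values): with __kmp_xproc = 16,
    // __kmp_nThreadsPerCore = 2 and nCoresPerPkg = 8 as reported by leaf 11,
    // the fallback below yields __kmp_ncores = 16 / 2 = 8 and
    // nPackages = (16 + 8 - 1) / 8 = 2.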
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  unsigned int proc;
  int nApics = 0;
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(proc);

    // Extract labels for each level in the machine topology map from Apic ID.
    Address addr(depth);
    int prev_shift = 0;

    for (level = 0; level < depth; level++) {
      __kmp_x86_cpuid(11, level, &buf);
      unsigned apicId = buf.edx;
      if (buf.ebx == 0) {
        if (level != depth - 1) {
          KMP_CPU_FREE(oldMask);
          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
          return -1;
        }
        addr.labels[depth - level - 1] = apicId >> prev_shift;
        level++;
        break;
      }
      int shift = buf.eax & 0x1f;
      int mask = (1 << shift) - 1;
      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
      prev_shift = shift;
    }
    if (level != depth) {
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }

    retval[nApics] = AddrUnsPair(addr, proc);
    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, return now.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

  // Find the radix at each of the levels.
  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  for (level = 0; level < depth; level++) {
    totals[level] = 1;
    maxCt[level] = 1;
    counts[level] = 1;
    last[level] = retval[0].first.labels[level];
  }

  // From here on, the iteration variable "level" runs from the finest level to
  // the coarsest, i.e. we iterate forward through
  // (*address2os)[].first.labels[] - in the previous loops, we iterated
  // backwards.
  for (proc = 1; (int)proc < nApics; proc++) {
    int level;
    for (level = 0; level < depth; level++) {
      if (retval[proc].first.labels[level] != last[level]) {
        int j;
        for (j = level + 1; j < depth; j++) {
          totals[j]++;
          counts[j] = 1;
          // The line below causes incorrect topology information to be printed
          // in case the max value for some level (maxCt[level]) is encountered
          // earlier than some smaller value while going through the array. For
          // example, let pkg0 have 4 cores and pkg1 have 2 cores. Then
          // maxCt[1] == 2 whereas it must be 4.
          // TODO!!! Check if it can be commented safely
          // maxCt[j] = 1;
          last[j] = retval[proc].first.labels[j];
        }
        totals[level]++;
        counts[level]++;
        if (counts[level] > maxCt[level]) {
          maxCt[level] = counts[level];
        }
        last[level] = retval[proc].first.labels[level];
        break;
      } else if (level == depth - 1) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
        return -1;
      }
    }
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return if affinity is not
  // enabled.
  if (threadLevel >= 0) {
    __kmp_nThreadsPerCore = maxCt[threadLevel];
  } else {
    __kmp_nThreadsPerCore = 1;
  }
  nPackages = totals[pkgLevel];

  if (coreLevel >= 0) {
    __kmp_ncores = totals[coreLevel];
    nCoresPerPkg = maxCt[coreLevel];
  } else {
    __kmp_ncores = nPackages;
    nCoresPerPkg = 1;
  }

  // Check to see if the machine topology is uniform
  unsigned prod = maxCt[0];
  for (level = 1; level < depth; level++) {
    prod *= maxCt[level];
  }
  bool uniform = (prod == totals[level - 1]);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", totals[0]);
    for (level = 1; level <= pkgLevel; level++) {
      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (proc = 0; (int)proc < nApics; ++proc) {
    __kmp_pu_os_idx[proc] = retval[proc].second;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  int new_depth = 0;
  for (level = 0; level < depth; level++) {
    if ((maxCt[level] == 1) && (level != pkgLevel)) {
      continue;
    }
    new_depth++;
  }

  // If we are removing any levels, allocate a new vector to return,
  // and copy the relevant information to it.
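  // Illustrative example (hypothetical): with depth 3 (package, core, thread)
  // and maxCt[threadLevel] == 1, only the thread level is dropped, new_depth
  // becomes 2, and the labels are repacked below as {pkgId, coreId} only.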
1601 if (new_depth != depth) { 1602 AddrUnsPair *new_retval = 1603 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1604 for (proc = 0; (int)proc < nApics; proc++) { 1605 Address addr(new_depth); 1606 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1607 } 1608 int new_level = 0; 1609 int newPkgLevel = -1; 1610 int newCoreLevel = -1; 1611 int newThreadLevel = -1; 1612 int i; 1613 for (level = 0; level < depth; level++) { 1614 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1615 // Remove this level. Never remove the package level 1616 continue; 1617 } 1618 if (level == pkgLevel) { 1619 newPkgLevel = level; 1620 } 1621 if (level == coreLevel) { 1622 newCoreLevel = level; 1623 } 1624 if (level == threadLevel) { 1625 newThreadLevel = level; 1626 } 1627 for (proc = 0; (int)proc < nApics; proc++) { 1628 new_retval[proc].first.labels[new_level] = 1629 retval[proc].first.labels[level]; 1630 } 1631 new_level++; 1632 } 1633 1634 __kmp_free(retval); 1635 retval = new_retval; 1636 depth = new_depth; 1637 pkgLevel = newPkgLevel; 1638 coreLevel = newCoreLevel; 1639 threadLevel = newThreadLevel; 1640 } 1641 1642 if (__kmp_affinity_gran_levels < 0) { 1643 // Set the granularity level based on what levels are modeled 1644 // in the machine topology map. 1645 __kmp_affinity_gran_levels = 0; 1646 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1647 __kmp_affinity_gran_levels++; 1648 } 1649 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1650 __kmp_affinity_gran_levels++; 1651 } 1652 if (__kmp_affinity_gran > affinity_gran_package) { 1653 __kmp_affinity_gran_levels++; 1654 } 1655 } 1656 1657 if (__kmp_affinity_verbose) { 1658 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel, 1659 threadLevel); 1660 } 1661 1662 __kmp_free(last); 1663 __kmp_free(maxCt); 1664 __kmp_free(counts); 1665 __kmp_free(totals); 1666 KMP_CPU_FREE(oldMask); 1667 *address2os = retval; 1668 return depth; 1669 } 1670 1671 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1672 1673 #define osIdIndex 0 1674 #define threadIdIndex 1 1675 #define coreIdIndex 2 1676 #define pkgIdIndex 3 1677 #define nodeIdIndex 4 1678 1679 typedef unsigned *ProcCpuInfo; 1680 static unsigned maxIndex = pkgIdIndex; 1681 1682 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) { 1683 const unsigned *aa = (const unsigned *)a; 1684 const unsigned *bb = (const unsigned *)b; 1685 if (aa[osIdIndex] < bb[osIdIndex]) 1686 return -1; 1687 if (aa[osIdIndex] > bb[osIdIndex]) 1688 return 1; 1689 return 0; 1690 }; 1691 1692 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 1693 const void *b) { 1694 unsigned i; 1695 const unsigned *aa = *((const unsigned **)a); 1696 const unsigned *bb = *((const unsigned **)b); 1697 for (i = maxIndex;; i--) { 1698 if (aa[i] < bb[i]) 1699 return -1; 1700 if (aa[i] > bb[i]) 1701 return 1; 1702 if (i == osIdIndex) 1703 break; 1704 } 1705 return 0; 1706 } 1707 1708 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1709 // affinity map. 1710 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, 1711 int *line, 1712 kmp_i18n_id_t *const msg_id, 1713 FILE *f) { 1714 *address2os = NULL; 1715 *msg_id = kmp_i18n_null; 1716 1717 // Scan of the file, and count the number of "processor" (osId) fields, 1718 // and find the highest value of <n> for a node_<n> field. 
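// For reference, the lines this first pass cares about typically look like
// (assuming the usual /proc/cpuinfo layout, or an alternate file in the same
// format):
//   processor   : 0
//   node_0 id   : 0
// Only the count of "processor" records and the largest <n> seen in any
// "node_<n> id" field are collected here; the full parse happens below.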
1719 char buf[256]; 1720 unsigned num_records = 0; 1721 while (!feof(f)) { 1722 buf[sizeof(buf) - 1] = 1; 1723 if (!fgets(buf, sizeof(buf), f)) { 1724 // Read errors presumably because of EOF 1725 break; 1726 } 1727 1728 char s1[] = "processor"; 1729 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1730 num_records++; 1731 continue; 1732 } 1733 1734 // FIXME - this will match "node_<n> <garbage>" 1735 unsigned level; 1736 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1737 if (nodeIdIndex + level >= maxIndex) { 1738 maxIndex = nodeIdIndex + level; 1739 } 1740 continue; 1741 } 1742 } 1743 1744 // Check for empty file / no valid processor records, or too many. The number 1745 // of records can't exceed the number of valid bits in the affinity mask. 1746 if (num_records == 0) { 1747 *line = 0; 1748 *msg_id = kmp_i18n_str_NoProcRecords; 1749 return -1; 1750 } 1751 if (num_records > (unsigned)__kmp_xproc) { 1752 *line = 0; 1753 *msg_id = kmp_i18n_str_TooManyProcRecords; 1754 return -1; 1755 } 1756 1757 // Set the file pointer back to the beginning, so that we can scan the file 1758 // again, this time performing a full parse of the data. Allocate a vector of 1759 // ProcCpuInfo objects, where we will place the data. Adding an extra element 1760 // at the end allows us to remove a lot of extra checks for termination 1761 // conditions. 1762 if (fseek(f, 0, SEEK_SET) != 0) { 1763 *line = 0; 1764 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1765 return -1; 1766 } 1767 1768 // Allocate the array of records to store the proc info in. The dummy 1769 // element at the end makes the logic in filling them out easier to code. 1770 unsigned **threadInfo = 1771 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 1772 unsigned i; 1773 for (i = 0; i <= num_records; i++) { 1774 threadInfo[i] = 1775 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 1776 } 1777 1778 #define CLEANUP_THREAD_INFO \ 1779 for (i = 0; i <= num_records; i++) { \ 1780 __kmp_free(threadInfo[i]); \ 1781 } \ 1782 __kmp_free(threadInfo); 1783 1784 // A value of UINT_MAX means that we didn't find the field 1785 unsigned __index; 1786 1787 #define INIT_PROC_INFO(p) \ 1788 for (__index = 0; __index <= maxIndex; __index++) { \ 1789 (p)[__index] = UINT_MAX; \ 1790 } 1791 1792 for (i = 0; i <= num_records; i++) { 1793 INIT_PROC_INFO(threadInfo[i]); 1794 } 1795 1796 unsigned num_avail = 0; 1797 *line = 0; 1798 while (!feof(f)) { 1799 // Create an inner scoping level, so that all the goto targets at the end of 1800 // the loop appear in an outer scoping level. This avoids warnings about 1801 // jumping past an initialization to a target in the same block. 1802 { 1803 buf[sizeof(buf) - 1] = 1; 1804 bool long_line = false; 1805 if (!fgets(buf, sizeof(buf), f)) { 1806 // Read errors presumably because of EOF 1807 // If there is valid data in threadInfo[num_avail], then fake 1808 // a blank line to ensure that the last address gets parsed. 1809 bool valid = false; 1810 for (i = 0; i <= maxIndex; i++) { 1811 if (threadInfo[num_avail][i] != UINT_MAX) { 1812 valid = true; 1813 } 1814 } 1815 if (!valid) { 1816 break; 1817 } 1818 buf[0] = 0; 1819 } else if (!buf[sizeof(buf) - 1]) { 1820 // The line is longer than the buffer. Set a flag and don't 1821 // emit an error if we were going to ignore the line, anyway.
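// (Note: the sentinel written into buf[sizeof(buf) - 1] before each fgets()
// call is only overwritten with '\0' when fgets() fills the whole buffer, so
// a cleared sentinel is how an over-long line is detected.)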
1822 long_line = true; 1823 1824 #define CHECK_LINE \ 1825 if (long_line) { \ 1826 CLEANUP_THREAD_INFO; \ 1827 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1828 return -1; \ 1829 } 1830 } 1831 (*line)++; 1832 1833 char s1[] = "processor"; 1834 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1835 CHECK_LINE; 1836 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1837 unsigned val; 1838 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1839 goto no_val; 1840 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 1841 goto dup_field; 1842 threadInfo[num_avail][osIdIndex] = val; 1843 #if KMP_OS_LINUX && USE_SYSFS_INFO 1844 char path[256]; 1845 KMP_SNPRINTF( 1846 path, sizeof(path), 1847 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1848 threadInfo[num_avail][osIdIndex]); 1849 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1850 1851 KMP_SNPRINTF(path, sizeof(path), 1852 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1853 threadInfo[num_avail][osIdIndex]); 1854 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1855 continue; 1856 #else 1857 } 1858 char s2[] = "physical id"; 1859 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1860 CHECK_LINE; 1861 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1862 unsigned val; 1863 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1864 goto no_val; 1865 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 1866 goto dup_field; 1867 threadInfo[num_avail][pkgIdIndex] = val; 1868 continue; 1869 } 1870 char s3[] = "core id"; 1871 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1872 CHECK_LINE; 1873 char *p = strchr(buf + sizeof(s3) - 1, ':'); 1874 unsigned val; 1875 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1876 goto no_val; 1877 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 1878 goto dup_field; 1879 threadInfo[num_avail][coreIdIndex] = val; 1880 continue; 1881 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 1882 } 1883 char s4[] = "thread id"; 1884 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 1885 CHECK_LINE; 1886 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1887 unsigned val; 1888 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1889 goto no_val; 1890 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 1891 goto dup_field; 1892 threadInfo[num_avail][threadIdIndex] = val; 1893 continue; 1894 } 1895 unsigned level; 1896 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1897 CHECK_LINE; 1898 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1899 unsigned val; 1900 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1901 goto no_val; 1902 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 1903 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 1904 goto dup_field; 1905 threadInfo[num_avail][nodeIdIndex + level] = val; 1906 continue; 1907 } 1908 1909 // We didn't recognize the leading token on the line. There are lots of 1910 // leading tokens that we don't recognize - if the line isn't empty, go on 1911 // to the next line. 1912 if ((*buf != 0) && (*buf != '\n')) { 1913 // If the line is longer than the buffer, read characters 1914 // until we find a newline. 1915 if (long_line) { 1916 int ch; 1917 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 1918 ; 1919 } 1920 continue; 1921 } 1922 1923 // A newline has signalled the end of the processor record. 1924 // Check that there aren't too many procs specified. 1925 if ((int)num_avail == __kmp_xproc) { 1926 CLEANUP_THREAD_INFO; 1927 *msg_id = kmp_i18n_str_TooManyEntries; 1928 return -1; 1929 } 1930 1931 // Check for missing fields. 
The osId field must be there, and we 1932 // currently require that the physical id field is specified, also. 1933 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 1934 CLEANUP_THREAD_INFO; 1935 *msg_id = kmp_i18n_str_MissingProcField; 1936 return -1; 1937 } 1938 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 1939 CLEANUP_THREAD_INFO; 1940 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 1941 return -1; 1942 } 1943 1944 // Skip this proc if it is not included in the machine model. 1945 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 1946 __kmp_affin_fullMask)) { 1947 INIT_PROC_INFO(threadInfo[num_avail]); 1948 continue; 1949 } 1950 1951 // We have a successful parse of this proc's info. 1952 // Increment the counter, and prepare for the next proc. 1953 num_avail++; 1954 KMP_ASSERT(num_avail <= num_records); 1955 INIT_PROC_INFO(threadInfo[num_avail]); 1956 } 1957 continue; 1958 1959 no_val: 1960 CLEANUP_THREAD_INFO; 1961 *msg_id = kmp_i18n_str_MissingValCpuinfo; 1962 return -1; 1963 1964 dup_field: 1965 CLEANUP_THREAD_INFO; 1966 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 1967 return -1; 1968 } 1969 *line = 0; 1970 1971 #if KMP_MIC && REDUCE_TEAM_SIZE 1972 unsigned teamSize = 0; 1973 #endif // KMP_MIC && REDUCE_TEAM_SIZE 1974 1975 // check for num_records == __kmp_xproc ??? 1976 1977 // If there's only one thread context to bind to, form an Address object with 1978 // depth 1 and return immediately (or, if affinity is off, set address2os to 1979 // NULL and return). 1980 // 1981 // If it is configured to omit the package level when there is only a single 1982 // package, the logic at the end of this routine won't work if there is only a 1983 // single thread - it would try to form an Address object with depth 0. 1984 KMP_ASSERT(num_avail > 0); 1985 KMP_ASSERT(num_avail <= num_records); 1986 if (num_avail == 1) { 1987 __kmp_ncores = 1; 1988 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1989 if (__kmp_affinity_verbose) { 1990 if (!KMP_AFFINITY_CAPABLE()) { 1991 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 1992 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1993 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1994 } else { 1995 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1996 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 1997 __kmp_affin_fullMask); 1998 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 1999 if (__kmp_affinity_respect_mask) { 2000 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2001 } else { 2002 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2003 } 2004 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2005 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2006 } 2007 int index; 2008 kmp_str_buf_t buf; 2009 __kmp_str_buf_init(&buf); 2010 __kmp_str_buf_print(&buf, "1"); 2011 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2012 __kmp_str_buf_print(&buf, " x 1"); 2013 } 2014 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2015 __kmp_str_buf_free(&buf); 2016 } 2017 2018 if (__kmp_affinity_type == affinity_none) { 2019 CLEANUP_THREAD_INFO; 2020 return 0; 2021 } 2022 2023 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2024 Address addr(1); 2025 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2026 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2027 2028 if (__kmp_affinity_gran_levels < 0) { 2029 __kmp_affinity_gran_levels = 0; 2030 } 2031 2032 if (__kmp_affinity_verbose) { 2033 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2034 } 2035 2036 CLEANUP_THREAD_INFO; 2037 
return 1; 2038 } 2039 2040 // Sort the threadInfo table by physical Id. 2041 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2042 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2043 2044 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2045 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2046 // the chips on a system. Although coreId's are usually assigned 2047 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2048 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2049 // 2050 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2051 // total # packages) are at this point - we want to determine that now. We 2052 // only have an upper bound on the first two figures. 2053 unsigned *counts = 2054 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2055 unsigned *maxCt = 2056 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2057 unsigned *totals = 2058 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2059 unsigned *lastId = 2060 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2061 2062 bool assign_thread_ids = false; 2063 unsigned threadIdCt; 2064 unsigned index; 2065 2066 restart_radix_check: 2067 threadIdCt = 0; 2068 2069 // Initialize the counter arrays with data from threadInfo[0]. 2070 if (assign_thread_ids) { 2071 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2072 threadInfo[0][threadIdIndex] = threadIdCt++; 2073 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2074 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2075 } 2076 } 2077 for (index = 0; index <= maxIndex; index++) { 2078 counts[index] = 1; 2079 maxCt[index] = 1; 2080 totals[index] = 1; 2081 lastId[index] = threadInfo[0][index]; 2082 ; 2083 } 2084 2085 // Run through the rest of the OS procs. 2086 for (i = 1; i < num_avail; i++) { 2087 // Find the most significant index whose id differs from the id for the 2088 // previous OS proc. 2089 for (index = maxIndex; index >= threadIdIndex; index--) { 2090 if (assign_thread_ids && (index == threadIdIndex)) { 2091 // Auto-assign the thread id field if it wasn't specified. 2092 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2093 threadInfo[i][threadIdIndex] = threadIdCt++; 2094 } 2095 // Apparently the thread id field was specified for some entries and not 2096 // others. Start the thread id counter off at the next higher thread id. 2097 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2098 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2099 } 2100 } 2101 if (threadInfo[i][index] != lastId[index]) { 2102 // Run through all indices which are less significant, and reset the 2103 // counts to 1. At all levels up to and including index, we need to 2104 // increment the totals and record the last id. 2105 unsigned index2; 2106 for (index2 = threadIdIndex; index2 < index; index2++) { 2107 totals[index2]++; 2108 if (counts[index2] > maxCt[index2]) { 2109 maxCt[index2] = counts[index2]; 2110 } 2111 counts[index2] = 1; 2112 lastId[index2] = threadInfo[i][index2]; 2113 } 2114 counts[index]++; 2115 totals[index]++; 2116 lastId[index] = threadInfo[i][index]; 2117 2118 if (assign_thread_ids && (index > threadIdIndex)) { 2119 2120 #if KMP_MIC && REDUCE_TEAM_SIZE 2121 // The default team size is the total #threads in the machine 2122 // minus 1 thread for every core that has 3 or more threads. 2123 teamSize += (threadIdCt <= 2) ? 
(threadIdCt) : (threadIdCt - 1); 2124 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2125 2126 // Restart the thread counter, as we are on a new core. 2127 threadIdCt = 0; 2128 2129 // Auto-assign the thread id field if it wasn't specified. 2130 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2131 threadInfo[i][threadIdIndex] = threadIdCt++; 2132 } 2133 2134 // Apparently the thread id field was specified for some entries and 2135 // not others. Start the thread id counter off at the next higher 2136 // thread id. 2137 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2138 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2139 } 2140 } 2141 break; 2142 } 2143 } 2144 if (index < threadIdIndex) { 2145 // If thread ids were specified, it is an error if they are not unique. 2146 // Also, check that we haven't already restarted the loop (to be safe - 2147 // shouldn't need to). 2148 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2149 __kmp_free(lastId); 2150 __kmp_free(totals); 2151 __kmp_free(maxCt); 2152 __kmp_free(counts); 2153 CLEANUP_THREAD_INFO; 2154 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2155 return -1; 2156 } 2157 2158 // If the thread ids were not specified and we see entries that 2159 // are duplicates, start the loop over and assign the thread ids manually. 2160 assign_thread_ids = true; 2161 goto restart_radix_check; 2162 } 2163 } 2164 2165 #if KMP_MIC && REDUCE_TEAM_SIZE 2166 // The default team size is the total #threads in the machine 2167 // minus 1 thread for every core that has 3 or more threads. 2168 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2169 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2170 2171 for (index = threadIdIndex; index <= maxIndex; index++) { 2172 if (counts[index] > maxCt[index]) { 2173 maxCt[index] = counts[index]; 2174 } 2175 } 2176 2177 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2178 nCoresPerPkg = maxCt[coreIdIndex]; 2179 nPackages = totals[pkgIdIndex]; 2180 2181 // Check to see if the machine topology is uniform 2182 unsigned prod = totals[maxIndex]; 2183 for (index = threadIdIndex; index < maxIndex; index++) { 2184 prod *= maxCt[index]; 2185 } 2186 bool uniform = (prod == totals[threadIdIndex]); 2187 2188 // When affinity is off, this routine will still be called to set 2189 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2190 // Make sure all these vars are set correctly, and return now if affinity is 2191 // not enabled.
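// Worked example (hypothetical machine with no node_<n> fields): 2 packages
// x 4 cores/package x 2 threads/core gives prod == 2 * 4 * 2 == 16, so the
// topology is flagged uniform only if exactly 16 processor records survived
// the affinity-mask filtering; __kmp_ncores below is then
// totals[coreIdIndex] == 8.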
2192 __kmp_ncores = totals[coreIdIndex]; 2193 2194 if (__kmp_affinity_verbose) { 2195 if (!KMP_AFFINITY_CAPABLE()) { 2196 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2197 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2198 if (uniform) { 2199 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2200 } else { 2201 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2202 } 2203 } else { 2204 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2205 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2206 __kmp_affin_fullMask); 2207 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2208 if (__kmp_affinity_respect_mask) { 2209 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2210 } else { 2211 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2212 } 2213 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2214 if (uniform) { 2215 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2216 } else { 2217 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2218 } 2219 } 2220 kmp_str_buf_t buf; 2221 __kmp_str_buf_init(&buf); 2222 2223 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2224 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2225 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2226 } 2227 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2228 maxCt[threadIdIndex], __kmp_ncores); 2229 2230 __kmp_str_buf_free(&buf); 2231 } 2232 2233 #if KMP_MIC && REDUCE_TEAM_SIZE 2234 // Set the default team size. 2235 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2236 __kmp_dflt_team_nth = teamSize; 2237 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2238 "__kmp_dflt_team_nth = %d\n", 2239 __kmp_dflt_team_nth)); 2240 } 2241 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2242 2243 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2244 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2245 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2246 for (i = 0; i < num_avail; ++i) { // fill the os indices 2247 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2248 } 2249 2250 if (__kmp_affinity_type == affinity_none) { 2251 __kmp_free(lastId); 2252 __kmp_free(totals); 2253 __kmp_free(maxCt); 2254 __kmp_free(counts); 2255 CLEANUP_THREAD_INFO; 2256 return 0; 2257 } 2258 2259 // Count the number of levels which have more nodes at that level than at the 2260 // parent's level (with there being an implicit root node of the top level). 2261 // This is equivalent to saying that there is at least one node at this level 2262 // which has a sibling. These levels are in the map, and the package level is 2263 // always in the map. 2264 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2265 int level = 0; 2266 for (index = threadIdIndex; index < maxIndex; index++) { 2267 KMP_ASSERT(totals[index] >= totals[index + 1]); 2268 inMap[index] = (totals[index] > totals[index + 1]); 2269 } 2270 inMap[maxIndex] = (totals[maxIndex] > 1); 2271 inMap[pkgIdIndex] = true; 2272 2273 int depth = 0; 2274 for (index = threadIdIndex; index <= maxIndex; index++) { 2275 if (inMap[index]) { 2276 depth++; 2277 } 2278 } 2279 KMP_ASSERT(depth > 0); 2280 2281 // Construct the data structure that is to be returned. 
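// Each returned entry pairs an Address (labels ordered from the most to the
// least significant level kept in the map) with the OS processor id; e.g.,
// for a {package, core, thread} map, a hypothetical OS proc 5 on package 1,
// core 2, thread 1 becomes ({1, 2, 1}, 5).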
2282 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2283 int pkgLevel = -1; 2284 int coreLevel = -1; 2285 int threadLevel = -1; 2286 2287 for (i = 0; i < num_avail; ++i) { 2288 Address addr(depth); 2289 unsigned os = threadInfo[i][osIdIndex]; 2290 int src_index; 2291 int dst_index = 0; 2292 2293 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2294 if (!inMap[src_index]) { 2295 continue; 2296 } 2297 addr.labels[dst_index] = threadInfo[i][src_index]; 2298 if (src_index == pkgIdIndex) { 2299 pkgLevel = dst_index; 2300 } else if (src_index == coreIdIndex) { 2301 coreLevel = dst_index; 2302 } else if (src_index == threadIdIndex) { 2303 threadLevel = dst_index; 2304 } 2305 dst_index++; 2306 } 2307 (*address2os)[i] = AddrUnsPair(addr, os); 2308 } 2309 2310 if (__kmp_affinity_gran_levels < 0) { 2311 // Set the granularity level based on what levels are modeled 2312 // in the machine topology map. 2313 unsigned src_index; 2314 __kmp_affinity_gran_levels = 0; 2315 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2316 if (!inMap[src_index]) { 2317 continue; 2318 } 2319 switch (src_index) { 2320 case threadIdIndex: 2321 if (__kmp_affinity_gran > affinity_gran_thread) { 2322 __kmp_affinity_gran_levels++; 2323 } 2324 2325 break; 2326 case coreIdIndex: 2327 if (__kmp_affinity_gran > affinity_gran_core) { 2328 __kmp_affinity_gran_levels++; 2329 } 2330 break; 2331 2332 case pkgIdIndex: 2333 if (__kmp_affinity_gran > affinity_gran_package) { 2334 __kmp_affinity_gran_levels++; 2335 } 2336 break; 2337 } 2338 } 2339 } 2340 2341 if (__kmp_affinity_verbose) { 2342 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2343 coreLevel, threadLevel); 2344 } 2345 2346 __kmp_free(inMap); 2347 __kmp_free(lastId); 2348 __kmp_free(totals); 2349 __kmp_free(maxCt); 2350 __kmp_free(counts); 2351 CLEANUP_THREAD_INFO; 2352 return depth; 2353 } 2354 2355 // Create and return a table of affinity masks, indexed by OS thread ID. 2356 // This routine handles OR'ing together all the affinity masks of threads 2357 // that are sufficiently close, if granularity > fine. 2358 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2359 unsigned *numUnique, 2360 AddrUnsPair *address2os, 2361 unsigned numAddrs) { 2362 // First form a table of affinity masks in order of OS thread id. 2363 unsigned depth; 2364 unsigned maxOsId; 2365 unsigned i; 2366 2367 KMP_ASSERT(numAddrs > 0); 2368 depth = address2os[0].first.depth; 2369 2370 maxOsId = 0; 2371 for (i = 0; i < numAddrs; i++) { 2372 unsigned osId = address2os[i].second; 2373 if (osId > maxOsId) { 2374 maxOsId = osId; 2375 } 2376 } 2377 kmp_affin_mask_t *osId2Mask; 2378 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2379 2380 // Sort the address2os table according to physical order. Doing so will put 2381 // all threads on the same core/package/node in consecutive locations. 2382 qsort(address2os, numAddrs, sizeof(*address2os), 2383 __kmp_affinity_cmp_Address_labels); 2384 2385 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2386 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2387 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2388 } 2389 if (__kmp_affinity_gran_levels >= (int)depth) { 2390 if (__kmp_affinity_verbose || 2391 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2392 KMP_WARNING(AffThreadsMayMigrate); 2393 } 2394 } 2395 2396 // Run through the table, forming the masks for all threads on each core. 
2397 // Threads on the same core will have identical "Address" objects, not 2398 // considering the last level, which must be the thread id. All threads on a 2399 // core will appear consecutively. 2400 unsigned unique = 0; 2401 unsigned j = 0; // index of 1st thread on core 2402 unsigned leader = 0; 2403 Address *leaderAddr = &(address2os[0].first); 2404 kmp_affin_mask_t *sum; 2405 KMP_CPU_ALLOC_ON_STACK(sum); 2406 KMP_CPU_ZERO(sum); 2407 KMP_CPU_SET(address2os[0].second, sum); 2408 for (i = 1; i < numAddrs; i++) { 2409 // If this thread is sufficiently close to the leader (within the 2410 // granularity setting), then set the bit for this os thread in the 2411 // affinity mask for this group, and go on to the next thread. 2412 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2413 KMP_CPU_SET(address2os[i].second, sum); 2414 continue; 2415 } 2416 2417 // For every thread in this group, copy the mask to the thread's entry in 2418 // the osId2Mask table. Mark the first address as a leader. 2419 for (; j < i; j++) { 2420 unsigned osId = address2os[j].second; 2421 KMP_DEBUG_ASSERT(osId <= maxOsId); 2422 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2423 KMP_CPU_COPY(mask, sum); 2424 address2os[j].first.leader = (j == leader); 2425 } 2426 unique++; 2427 2428 // Start a new mask. 2429 leader = i; 2430 leaderAddr = &(address2os[i].first); 2431 KMP_CPU_ZERO(sum); 2432 KMP_CPU_SET(address2os[i].second, sum); 2433 } 2434 2435 // For every thread in last group, copy the mask to the thread's 2436 // entry in the osId2Mask table. 2437 for (; j < i; j++) { 2438 unsigned osId = address2os[j].second; 2439 KMP_DEBUG_ASSERT(osId <= maxOsId); 2440 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2441 KMP_CPU_COPY(mask, sum); 2442 address2os[j].first.leader = (j == leader); 2443 } 2444 unique++; 2445 KMP_CPU_FREE_FROM_STACK(sum); 2446 2447 *maxIndex = maxOsId; 2448 *numUnique = unique; 2449 return osId2Mask; 2450 } 2451 2452 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2453 // as file-static than to try and pass them through the calling sequence of 2454 // the recursive-descent OMP_PLACES parser. 2455 static kmp_affin_mask_t *newMasks; 2456 static int numNewMasks; 2457 static int nextNewMask; 2458 2459 #define ADD_MASK(_mask) \ 2460 { \ 2461 if (nextNewMask >= numNewMasks) { \ 2462 int i; \ 2463 numNewMasks *= 2; \ 2464 kmp_affin_mask_t *temp; \ 2465 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2466 for (i = 0; i < numNewMasks / 2; i++) { \ 2467 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2468 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2469 KMP_CPU_COPY(dest, src); \ 2470 } \ 2471 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2472 newMasks = temp; \ 2473 } \ 2474 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2475 nextNewMask++; \ 2476 } 2477 2478 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2479 { \ 2480 if (((_osId) > _maxOsId) || \ 2481 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2482 if (__kmp_affinity_verbose || \ 2483 (__kmp_affinity_warnings && \ 2484 (__kmp_affinity_type != affinity_none))) { \ 2485 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2486 } \ 2487 } else { \ 2488 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2489 } \ 2490 } 2491 2492 // Re-parse the proclist (for the explicit affinity type), and form the list 2493 // of affinity newMasks indexed by gtid. 
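// For example (hypothetical setting), KMP_AFFINITY="explicit,proclist=[3,0,{1,2},4-6:2]"
// would produce one mask per place: {3}, {0}, {1,2}, {4} and {6}; OS ids that
// are invalid or outside the full mask are skipped (with a warning when
// verbose/warnings are enabled).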
2494 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2495 unsigned int *out_numMasks, 2496 const char *proclist, 2497 kmp_affin_mask_t *osId2Mask, 2498 int maxOsId) { 2499 int i; 2500 const char *scan = proclist; 2501 const char *next = proclist; 2502 2503 // We use malloc() for the temporary mask vector, so that we can use 2504 // realloc() to extend it. 2505 numNewMasks = 2; 2506 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2507 nextNewMask = 0; 2508 kmp_affin_mask_t *sumMask; 2509 KMP_CPU_ALLOC(sumMask); 2510 int setSize = 0; 2511 2512 for (;;) { 2513 int start, end, stride; 2514 2515 SKIP_WS(scan); 2516 next = scan; 2517 if (*next == '\0') { 2518 break; 2519 } 2520 2521 if (*next == '{') { 2522 int num; 2523 setSize = 0; 2524 next++; // skip '{' 2525 SKIP_WS(next); 2526 scan = next; 2527 2528 // Read the first integer in the set. 2529 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2530 SKIP_DIGITS(next); 2531 num = __kmp_str_to_int(scan, *next); 2532 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2533 2534 // Copy the mask for that osId to the sum (union) mask. 2535 if ((num > maxOsId) || 2536 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2537 if (__kmp_affinity_verbose || 2538 (__kmp_affinity_warnings && 2539 (__kmp_affinity_type != affinity_none))) { 2540 KMP_WARNING(AffIgnoreInvalidProcID, num); 2541 } 2542 KMP_CPU_ZERO(sumMask); 2543 } else { 2544 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2545 setSize = 1; 2546 } 2547 2548 for (;;) { 2549 // Check for end of set. 2550 SKIP_WS(next); 2551 if (*next == '}') { 2552 next++; // skip '}' 2553 break; 2554 } 2555 2556 // Skip optional comma. 2557 if (*next == ',') { 2558 next++; 2559 } 2560 SKIP_WS(next); 2561 2562 // Read the next integer in the set. 2563 scan = next; 2564 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2565 2566 SKIP_DIGITS(next); 2567 num = __kmp_str_to_int(scan, *next); 2568 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2569 2570 // Add the mask for that osId to the sum mask. 2571 if ((num > maxOsId) || 2572 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2573 if (__kmp_affinity_verbose || 2574 (__kmp_affinity_warnings && 2575 (__kmp_affinity_type != affinity_none))) { 2576 KMP_WARNING(AffIgnoreInvalidProcID, num); 2577 } 2578 } else { 2579 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2580 setSize++; 2581 } 2582 } 2583 if (setSize > 0) { 2584 ADD_MASK(sumMask); 2585 } 2586 2587 SKIP_WS(next); 2588 if (*next == ',') { 2589 next++; 2590 } 2591 scan = next; 2592 continue; 2593 } 2594 2595 // Read the first integer. 2596 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2597 SKIP_DIGITS(next); 2598 start = __kmp_str_to_int(scan, *next); 2599 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2600 SKIP_WS(next); 2601 2602 // If this isn't a range, then add a mask to the list and go on. 2603 if (*next != '-') { 2604 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2605 2606 // Skip optional comma. 2607 if (*next == ',') { 2608 next++; 2609 } 2610 scan = next; 2611 continue; 2612 } 2613 2614 // This is a range. Skip over the '-' and read in the 2nd int. 
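// (For example, a hypothetical entry "2-8:3" parses as start=2, end=8,
// stride=3 and adds masks for OS procs 2, 5 and 8.)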
2615 next++; // skip '-' 2616 SKIP_WS(next); 2617 scan = next; 2618 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2619 SKIP_DIGITS(next); 2620 end = __kmp_str_to_int(scan, *next); 2621 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2622 2623 // Check for a stride parameter 2624 stride = 1; 2625 SKIP_WS(next); 2626 if (*next == ':') { 2627 // A stride is specified. Skip over the ':" and read the 3rd int. 2628 int sign = +1; 2629 next++; // skip ':' 2630 SKIP_WS(next); 2631 scan = next; 2632 if (*next == '-') { 2633 sign = -1; 2634 next++; 2635 SKIP_WS(next); 2636 scan = next; 2637 } 2638 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2639 SKIP_DIGITS(next); 2640 stride = __kmp_str_to_int(scan, *next); 2641 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2642 stride *= sign; 2643 } 2644 2645 // Do some range checks. 2646 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2647 if (stride > 0) { 2648 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2649 } else { 2650 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2651 } 2652 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2653 2654 // Add the mask for each OS proc # to the list. 2655 if (stride > 0) { 2656 do { 2657 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2658 start += stride; 2659 } while (start <= end); 2660 } else { 2661 do { 2662 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2663 start += stride; 2664 } while (start >= end); 2665 } 2666 2667 // Skip optional comma. 2668 SKIP_WS(next); 2669 if (*next == ',') { 2670 next++; 2671 } 2672 scan = next; 2673 } 2674 2675 *out_numMasks = nextNewMask; 2676 if (nextNewMask == 0) { 2677 *out_masks = NULL; 2678 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2679 return; 2680 } 2681 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2682 for (i = 0; i < nextNewMask; i++) { 2683 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 2684 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 2685 KMP_CPU_COPY(dest, src); 2686 } 2687 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2688 KMP_CPU_FREE(sumMask); 2689 } 2690 2691 #if OMP_40_ENABLED 2692 2693 /*----------------------------------------------------------------------------- 2694 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2695 places. Again, Here is the grammar: 2696 2697 place_list := place 2698 place_list := place , place_list 2699 place := num 2700 place := place : num 2701 place := place : num : signed 2702 place := { subplacelist } 2703 place := ! 
place // (lowest priority) 2704 subplace_list := subplace 2705 subplace_list := subplace , subplace_list 2706 subplace := num 2707 subplace := num : num 2708 subplace := num : num : signed 2709 signed := num 2710 signed := + signed 2711 signed := - signed 2712 -----------------------------------------------------------------------------*/ 2713 2714 static void __kmp_process_subplace_list(const char **scan, 2715 kmp_affin_mask_t *osId2Mask, 2716 int maxOsId, kmp_affin_mask_t *tempMask, 2717 int *setSize) { 2718 const char *next; 2719 2720 for (;;) { 2721 int start, count, stride, i; 2722 2723 // Read in the starting proc id 2724 SKIP_WS(*scan); 2725 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2726 next = *scan; 2727 SKIP_DIGITS(next); 2728 start = __kmp_str_to_int(*scan, *next); 2729 KMP_ASSERT(start >= 0); 2730 *scan = next; 2731 2732 // valid follow sets are ',' ':' and '}' 2733 SKIP_WS(*scan); 2734 if (**scan == '}' || **scan == ',') { 2735 if ((start > maxOsId) || 2736 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2737 if (__kmp_affinity_verbose || 2738 (__kmp_affinity_warnings && 2739 (__kmp_affinity_type != affinity_none))) { 2740 KMP_WARNING(AffIgnoreInvalidProcID, start); 2741 } 2742 } else { 2743 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2744 (*setSize)++; 2745 } 2746 if (**scan == '}') { 2747 break; 2748 } 2749 (*scan)++; // skip ',' 2750 continue; 2751 } 2752 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2753 (*scan)++; // skip ':' 2754 2755 // Read count parameter 2756 SKIP_WS(*scan); 2757 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2758 next = *scan; 2759 SKIP_DIGITS(next); 2760 count = __kmp_str_to_int(*scan, *next); 2761 KMP_ASSERT(count >= 0); 2762 *scan = next; 2763 2764 // valid follow sets are ',' ':' and '}' 2765 SKIP_WS(*scan); 2766 if (**scan == '}' || **scan == ',') { 2767 for (i = 0; i < count; i++) { 2768 if ((start > maxOsId) || 2769 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2770 if (__kmp_affinity_verbose || 2771 (__kmp_affinity_warnings && 2772 (__kmp_affinity_type != affinity_none))) { 2773 KMP_WARNING(AffIgnoreInvalidProcID, start); 2774 } 2775 break; // don't proliferate warnings for large count 2776 } else { 2777 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2778 start++; 2779 (*setSize)++; 2780 } 2781 } 2782 if (**scan == '}') { 2783 break; 2784 } 2785 (*scan)++; // skip ',' 2786 continue; 2787 } 2788 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2789 (*scan)++; // skip ':' 2790 2791 // Read stride parameter 2792 int sign = +1; 2793 for (;;) { 2794 SKIP_WS(*scan); 2795 if (**scan == '+') { 2796 (*scan)++; // skip '+' 2797 continue; 2798 } 2799 if (**scan == '-') { 2800 sign *= -1; 2801 (*scan)++; // skip '-' 2802 continue; 2803 } 2804 break; 2805 } 2806 SKIP_WS(*scan); 2807 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2808 next = *scan; 2809 SKIP_DIGITS(next); 2810 stride = __kmp_str_to_int(*scan, *next); 2811 KMP_ASSERT(stride >= 0); 2812 *scan = next; 2813 stride *= sign; 2814 2815 // valid follow sets are ',' and '}' 2816 SKIP_WS(*scan); 2817 if (**scan == '}' || **scan == ',') { 2818 for (i = 0; i < count; i++) { 2819 if ((start > maxOsId) || 2820 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2821 if (__kmp_affinity_verbose || 2822 (__kmp_affinity_warnings && 2823 (__kmp_affinity_type != affinity_none))) { 2824 KMP_WARNING(AffIgnoreInvalidProcID, start); 2825 } 2826 
break; // don't proliferate warnings for large count 2827 } else { 2828 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2829 start += stride; 2830 (*setSize)++; 2831 } 2832 } 2833 if (**scan == '}') { 2834 break; 2835 } 2836 (*scan)++; // skip ',' 2837 continue; 2838 } 2839 2840 KMP_ASSERT2(0, "bad explicit places list"); 2841 } 2842 } 2843 2844 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 2845 int maxOsId, kmp_affin_mask_t *tempMask, 2846 int *setSize) { 2847 const char *next; 2848 2849 // valid follow sets are '{' '!' and num 2850 SKIP_WS(*scan); 2851 if (**scan == '{') { 2852 (*scan)++; // skip '{' 2853 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 2854 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 2855 (*scan)++; // skip '}' 2856 } else if (**scan == '!') { 2857 (*scan)++; // skip '!' 2858 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 2859 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 2860 } else if ((**scan >= '0') && (**scan <= '9')) { 2861 next = *scan; 2862 SKIP_DIGITS(next); 2863 int num = __kmp_str_to_int(*scan, *next); 2864 KMP_ASSERT(num >= 0); 2865 if ((num > maxOsId) || 2866 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2867 if (__kmp_affinity_verbose || 2868 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2869 KMP_WARNING(AffIgnoreInvalidProcID, num); 2870 } 2871 } else { 2872 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 2873 (*setSize)++; 2874 } 2875 *scan = next; // skip num 2876 } else { 2877 KMP_ASSERT2(0, "bad explicit places list"); 2878 } 2879 } 2880 2881 // static void 2882 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 2883 unsigned int *out_numMasks, 2884 const char *placelist, 2885 kmp_affin_mask_t *osId2Mask, 2886 int maxOsId) { 2887 int i, j, count, stride, sign; 2888 const char *scan = placelist; 2889 const char *next = placelist; 2890 2891 numNewMasks = 2; 2892 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2893 nextNewMask = 0; 2894 2895 // tempMask is modified based on the previous or initial 2896 // place to form the current place 2897 // previousMask contains the previous place 2898 kmp_affin_mask_t *tempMask; 2899 kmp_affin_mask_t *previousMask; 2900 KMP_CPU_ALLOC(tempMask); 2901 KMP_CPU_ZERO(tempMask); 2902 KMP_CPU_ALLOC(previousMask); 2903 KMP_CPU_ZERO(previousMask); 2904 int setSize = 0; 2905 2906 for (;;) { 2907 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 2908 2909 // valid follow sets are ',' ':' and EOL 2910 SKIP_WS(scan); 2911 if (*scan == '\0' || *scan == ',') { 2912 if (setSize > 0) { 2913 ADD_MASK(tempMask); 2914 } 2915 KMP_CPU_ZERO(tempMask); 2916 setSize = 0; 2917 if (*scan == '\0') { 2918 break; 2919 } 2920 scan++; // skip ',' 2921 continue; 2922 } 2923 2924 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2925 scan++; // skip ':' 2926 2927 // Read count parameter 2928 SKIP_WS(scan); 2929 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2930 next = scan; 2931 SKIP_DIGITS(next); 2932 count = __kmp_str_to_int(scan, *next); 2933 KMP_ASSERT(count >= 0); 2934 scan = next; 2935 2936 // valid follow sets are ',' ':' and EOL 2937 SKIP_WS(scan); 2938 if (*scan == '\0' || *scan == ',') { 2939 stride = +1; 2940 } else { 2941 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2942 scan++; // skip ':' 2943 2944 // Read stride parameter 2945 sign = +1; 2946 for (;;) { 2947 SKIP_WS(scan); 2948 if (*scan == '+') { 2949 scan++; // skip 
'+' 2950 continue; 2951 } 2952 if (*scan == '-') { 2953 sign *= -1; 2954 scan++; // skip '-' 2955 continue; 2956 } 2957 break; 2958 } 2959 SKIP_WS(scan); 2960 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2961 next = scan; 2962 SKIP_DIGITS(next); 2963 stride = __kmp_str_to_int(scan, *next); 2964 KMP_DEBUG_ASSERT(stride >= 0); 2965 scan = next; 2966 stride *= sign; 2967 } 2968 2969 // Add places determined by initial_place : count : stride 2970 for (i = 0; i < count; i++) { 2971 if (setSize == 0) { 2972 break; 2973 } 2974 // Add the current place, then build the next place (tempMask) from that 2975 KMP_CPU_COPY(previousMask, tempMask); 2976 ADD_MASK(previousMask); 2977 KMP_CPU_ZERO(tempMask); 2978 setSize = 0; 2979 KMP_CPU_SET_ITERATE(j, previousMask) { 2980 if (!KMP_CPU_ISSET(j, previousMask)) { 2981 continue; 2982 } 2983 if ((j + stride > maxOsId) || (j + stride < 0) || 2984 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 2985 (!KMP_CPU_ISSET(j + stride, 2986 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 2987 if ((__kmp_affinity_verbose || 2988 (__kmp_affinity_warnings && 2989 (__kmp_affinity_type != affinity_none))) && 2990 i < count - 1) { 2991 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 2992 } 2993 continue; 2994 } 2995 KMP_CPU_SET(j + stride, tempMask); 2996 setSize++; 2997 } 2998 } 2999 KMP_CPU_ZERO(tempMask); 3000 setSize = 0; 3001 3002 // valid follow sets are ',' and EOL 3003 SKIP_WS(scan); 3004 if (*scan == '\0') { 3005 break; 3006 } 3007 if (*scan == ',') { 3008 scan++; // skip ',' 3009 continue; 3010 } 3011 3012 KMP_ASSERT2(0, "bad explicit places list"); 3013 } 3014 3015 *out_numMasks = nextNewMask; 3016 if (nextNewMask == 0) { 3017 *out_masks = NULL; 3018 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3019 return; 3020 } 3021 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3022 KMP_CPU_FREE(tempMask); 3023 KMP_CPU_FREE(previousMask); 3024 for (i = 0; i < nextNewMask; i++) { 3025 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3026 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3027 KMP_CPU_COPY(dest, src); 3028 } 3029 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3030 } 3031 3032 #endif /* OMP_40_ENABLED */ 3033 3034 #undef ADD_MASK 3035 #undef ADD_MASK_OSID 3036 3037 #if KMP_USE_HWLOC 3038 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o, 3039 hwloc_obj_type_t type, 3040 hwloc_obj_t* f) { 3041 if (!hwloc_compare_types(o->type, type)) { 3042 if (*f == NULL) 3043 *f = o; // output first descendant found 3044 return 1; 3045 } 3046 int sum = 0; 3047 for (unsigned i = 0; i < o->arity; i++) 3048 sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); 3049 return sum; // will be 0 if no one found (as PU arity is 0) 3050 } 3051 3052 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t, 3053 hwloc_obj_t o, unsigned depth, 3054 hwloc_obj_t* f) { 3055 if (o->depth == depth) { 3056 if (*f == NULL) 3057 *f = o; // output first descendant found 3058 return 1; 3059 } 3060 int sum = 0; 3061 for (unsigned i = 0; i < o->arity; i++) 3062 sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); 3063 return sum; // will be 0 if no one found (as PU arity is 0) 3064 } 3065 3066 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) { 3067 // skip PUs descendants of the object o 3068 int skipped = 0; 3069 hwloc_obj_t hT = NULL; 3070 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3071 for (int i = 0; i < N; ++i) { 3072 
KMP_DEBUG_ASSERT(hT); 3073 unsigned idx = hT->os_index; 3074 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3075 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3076 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3077 ++skipped; 3078 } 3079 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3080 } 3081 return skipped; // count number of skipped units 3082 } 3083 3084 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) { 3085 // check if obj has PUs present in fullMask 3086 hwloc_obj_t hT = NULL; 3087 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3088 for (int i = 0; i < N; ++i) { 3089 KMP_DEBUG_ASSERT(hT); 3090 unsigned idx = hT->os_index; 3091 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) 3092 return 1; // found PU 3093 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3094 } 3095 return 0; // no PUs found 3096 } 3097 #endif // KMP_USE_HWLOC 3098 3099 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) { 3100 AddrUnsPair *newAddr; 3101 if (__kmp_hws_requested == 0) 3102 goto _exit; // no topology limiting actions requested, exit 3103 #if KMP_USE_HWLOC 3104 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3105 // Number of subobjects calculated dynamically, this works fine for 3106 // any non-uniform topology. 3107 // L2 cache objects are determined by depth, other objects - by type. 3108 hwloc_topology_t tp = __kmp_hwloc_topology; 3109 int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped 3110 int nCr=0, nTr=0; // number of requested units 3111 int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters 3112 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to) 3113 int L2depth, idx; 3114 3115 // check support of extensions ---------------------------------- 3116 int numa_support = 0, tile_support = 0; 3117 if (__kmp_pu_os_idx) 3118 hT = hwloc_get_pu_obj_by_os_index(tp, 3119 __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3120 else 3121 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3122 if (hT == NULL) { // something's gone wrong 3123 KMP_WARNING(AffHWSubsetUnsupported); 3124 goto _exit; 3125 } 3126 // check NUMA node 3127 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3128 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3129 if (hN != NULL && hN->depth > hS->depth) { 3130 numa_support = 1; // 1 in case socket includes node(s) 3131 } else if (__kmp_hws_node.num > 0) { 3132 // don't support sockets inside NUMA node (no such HW found for testing) 3133 KMP_WARNING(AffHWSubsetUnsupported); 3134 goto _exit; 3135 } 3136 // check L2 cahce, get object by depth because of multiple caches 3137 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3138 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3139 if (hL != NULL && __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3140 &hC) > 1) { 3141 tile_support = 1; // no sense to count L2 if it includes single core 3142 } else if (__kmp_hws_tile.num > 0) { 3143 if (__kmp_hws_core.num == 0) { 3144 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3145 __kmp_hws_tile.num = 0; 3146 } else { 3147 // L2 and core are both requested, but represent same object 3148 KMP_WARNING(AffHWSubsetInvalid); 3149 goto _exit; 3150 } 3151 } 3152 // end of check of extensions ----------------------------------- 3153 3154 // fill in unset items, validate settings ----------------------- 3155 if (__kmp_hws_socket.num == 0) 3156 __kmp_hws_socket.num = nPackages; // use all 
available sockets 3157 if (__kmp_hws_socket.offset >= nPackages) { 3158 KMP_WARNING(AffHWSubsetManySockets); 3159 goto _exit; 3160 } 3161 if (numa_support) { 3162 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3163 &hN); // num nodes in socket 3164 if (__kmp_hws_node.num == 0) 3165 __kmp_hws_node.num = NN; // use all available nodes 3166 if (__kmp_hws_node.offset >= NN) { 3167 KMP_WARNING(AffHWSubsetManyNodes); 3168 goto _exit; 3169 } 3170 if (tile_support) { 3171 // get num tiles in node 3172 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3173 if (__kmp_hws_tile.num == 0) { 3174 __kmp_hws_tile.num = NL + 1; 3175 } // use all available tiles, some node may have more tiles, thus +1 3176 if (__kmp_hws_tile.offset >= NL) { 3177 KMP_WARNING(AffHWSubsetManyTiles); 3178 goto _exit; 3179 } 3180 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3181 &hC); // num cores in tile 3182 if (__kmp_hws_core.num == 0) 3183 __kmp_hws_core.num = NC; // use all available cores 3184 if (__kmp_hws_core.offset >= NC) { 3185 KMP_WARNING(AffHWSubsetManyCores); 3186 goto _exit; 3187 } 3188 } else { // tile_support 3189 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3190 &hC); // num cores in node 3191 if (__kmp_hws_core.num == 0) 3192 __kmp_hws_core.num = NC; // use all available cores 3193 if (__kmp_hws_core.offset >= NC) { 3194 KMP_WARNING(AffHWSubsetManyCores); 3195 goto _exit; 3196 } 3197 } // tile_support 3198 } else { // numa_support 3199 if (tile_support) { 3200 // get num tiles in socket 3201 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3202 if (__kmp_hws_tile.num == 0) 3203 __kmp_hws_tile.num = NL; // use all available tiles 3204 if (__kmp_hws_tile.offset >= NL) { 3205 KMP_WARNING(AffHWSubsetManyTiles); 3206 goto _exit; 3207 } 3208 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3209 &hC); // num cores in tile 3210 if (__kmp_hws_core.num == 0) 3211 __kmp_hws_core.num = NC; // use all available cores 3212 if (__kmp_hws_core.offset >= NC) { 3213 KMP_WARNING(AffHWSubsetManyCores); 3214 goto _exit; 3215 } 3216 } else { // tile_support 3217 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3218 &hC); // num cores in socket 3219 if (__kmp_hws_core.num == 0) 3220 __kmp_hws_core.num = NC; // use all available cores 3221 if (__kmp_hws_core.offset >= NC) { 3222 KMP_WARNING(AffHWSubsetManyCores); 3223 goto _exit; 3224 } 3225 } // tile_support 3226 } 3227 if (__kmp_hws_proc.num == 0) 3228 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3229 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3230 KMP_WARNING(AffHWSubsetManyProcs); 3231 goto _exit; 3232 } 3233 // end of validation -------------------------------------------- 3234 3235 if (pAddr) // pAddr is NULL in case of affinity_none 3236 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3237 __kmp_avail_proc); // max size 3238 // main loop to form HW subset ---------------------------------- 3239 hS = NULL; 3240 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3241 for (int s = 0; s < NP; ++s) { 3242 // Check Socket ----------------------------------------------- 3243 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3244 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3245 continue; // skip socket if all PUs are out of fullMask 3246 ++nS; // only count objects those have PUs in affinity mask 3247 if (nS <= __kmp_hws_socket.offset || 3248 nS > __kmp_hws_socket.num + 
__kmp_hws_socket.offset) { 3249 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3250 continue; // move to next socket 3251 } 3252 nCr = 0; // count number of cores per socket 3253 // socket requested, go down the topology tree 3254 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3255 if (numa_support) { 3256 nN = 0; 3257 hN = NULL; 3258 // num nodes in current socket 3259 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, 3260 &hN); 3261 for (int n = 0; n < NN; ++n) { 3262 // Check NUMA Node ---------------------------------------- 3263 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3264 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3265 continue; // skip node if all PUs are out of fullMask 3266 } 3267 ++nN; 3268 if (nN <= __kmp_hws_node.offset || 3269 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3270 // skip node as not requested 3271 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node 3272 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3273 continue; // move to next node 3274 } 3275 // node requested, go down the topology tree 3276 if (tile_support) { 3277 nL = 0; 3278 hL = NULL; 3279 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3280 for (int l = 0; l < NL; ++l) { 3281 // Check L2 (tile) ------------------------------------ 3282 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3283 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3284 continue; // skip tile if all PUs are out of fullMask 3285 } 3286 ++nL; 3287 if (nL <= __kmp_hws_tile.offset || 3288 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3289 // skip tile as not requested 3290 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3291 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3292 continue; // move to next tile 3293 } 3294 // tile requested, go down the topology tree 3295 nC = 0; 3296 hC = NULL; 3297 // num cores in current tile 3298 int NC = __kmp_hwloc_count_children_by_type(tp, hL, 3299 HWLOC_OBJ_CORE, &hC); 3300 for (int c = 0; c < NC; ++c) { 3301 // Check Core --------------------------------------- 3302 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3303 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3304 continue; // skip core if all PUs are out of fullMask 3305 } 3306 ++nC; 3307 if (nC <= __kmp_hws_core.offset || 3308 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3309 // skip node as not requested 3310 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3311 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3312 continue; // move to next node 3313 } 3314 // core requested, go down to PUs 3315 nT = 0; 3316 nTr = 0; 3317 hT = NULL; 3318 // num procs in current core 3319 int NT = __kmp_hwloc_count_children_by_type(tp, hC, 3320 HWLOC_OBJ_PU, &hT); 3321 for (int t = 0; t < NT; ++t) { 3322 // Check PU --------------------------------------- 3323 idx = hT->os_index; 3324 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3325 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3326 continue; // skip PU if not in fullMask 3327 } 3328 ++nT; 3329 if (nT <= __kmp_hws_proc.offset || 3330 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3331 // skip PU 3332 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3333 ++n_old; 3334 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3335 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3336 continue; // move to next node 3337 } 3338 ++nTr; 3339 if (pAddr) // collect requested thread's data 3340 newAddr[n_new] = (*pAddr)[n_old]; 3341 ++n_new; 3342 
++n_old; 3343 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3344 } // threads loop 3345 if (nTr > 0) { 3346 ++nCr; // num cores per socket 3347 ++nCo; // total num cores 3348 if (nTr > nTpC) 3349 nTpC = nTr; // calc max threads per core 3350 } 3351 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3352 } // cores loop 3353 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3354 } // tiles loop 3355 } else { // tile_support 3356 // no tiles, check cores 3357 nC = 0; 3358 hC = NULL; 3359 // num cores in current node 3360 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3361 &hC); 3362 for (int c = 0; c < NC; ++c) { 3363 // Check Core --------------------------------------- 3364 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3365 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3366 continue; // skip core if all PUs are out of fullMask 3367 } 3368 ++nC; 3369 if (nC <= __kmp_hws_core.offset || 3370 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3371 // skip node as not requested 3372 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3373 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3374 continue; // move to next node 3375 } 3376 // core requested, go down to PUs 3377 nT = 0; 3378 nTr = 0; 3379 hT = NULL; 3380 int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, 3381 &hT); 3382 for (int t = 0; t < NT; ++t) { 3383 // Check PU --------------------------------------- 3384 idx = hT->os_index; 3385 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3386 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3387 continue; // skip PU if not in fullMask 3388 } 3389 ++nT; 3390 if (nT <= __kmp_hws_proc.offset || 3391 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3392 // skip PU 3393 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3394 ++n_old; 3395 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3396 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3397 continue; // move to next node 3398 } 3399 ++nTr; 3400 if (pAddr) // collect requested thread's data 3401 newAddr[n_new] = (*pAddr)[n_old]; 3402 ++n_new; 3403 ++n_old; 3404 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3405 } // threads loop 3406 if (nTr > 0) { 3407 ++nCr; // num cores per socket 3408 ++nCo; // total num cores 3409 if (nTr > nTpC) 3410 nTpC = nTr; // calc max threads per core 3411 } 3412 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3413 } // cores loop 3414 } // tiles support 3415 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3416 } // nodes loop 3417 } else { // numa_support 3418 // no NUMA support 3419 if (tile_support) { 3420 nL = 0; 3421 hL = NULL; 3422 // num tiles in current socket 3423 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3424 for (int l = 0; l < NL; ++l) { 3425 // Check L2 (tile) ------------------------------------ 3426 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3427 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3428 continue; // skip tile if all PUs are out of fullMask 3429 } 3430 ++nL; 3431 if (nL <= __kmp_hws_tile.offset || 3432 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3433 // skip tile as not requested 3434 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3435 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3436 continue; // move to next tile 3437 } 3438 // tile requested, go down the topology tree 3439 nC = 0; 3440 hC = NULL; 3441 // num cores per tile 3442 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3443 &hC); 3444 for (int c = 0; 
c < NC; ++c) { 3445 // Check Core --------------------------------------- 3446 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3447 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3448 continue; // skip core if all PUs are out of fullMask 3449 } 3450 ++nC; 3451 if (nC <= __kmp_hws_core.offset || 3452 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3453 // skip node as not requested 3454 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3455 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3456 continue; // move to next node 3457 } 3458 // core requested, go down to PUs 3459 nT = 0; 3460 nTr = 0; 3461 hT = NULL; 3462 // num procs per core 3463 int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, 3464 &hT); 3465 for (int t = 0; t < NT; ++t) { 3466 // Check PU --------------------------------------- 3467 idx = hT->os_index; 3468 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3469 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3470 continue; // skip PU if not in fullMask 3471 } 3472 ++nT; 3473 if (nT <= __kmp_hws_proc.offset || 3474 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3475 // skip PU 3476 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3477 ++n_old; 3478 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3479 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3480 continue; // move to next node 3481 } 3482 ++nTr; 3483 if (pAddr) // collect requested thread's data 3484 newAddr[n_new] = (*pAddr)[n_old]; 3485 ++n_new; 3486 ++n_old; 3487 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3488 } // threads loop 3489 if (nTr > 0) { 3490 ++nCr; // num cores per socket 3491 ++nCo; // total num cores 3492 if (nTr > nTpC) 3493 nTpC = nTr; // calc max threads per core 3494 } 3495 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3496 } // cores loop 3497 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3498 } // tiles loop 3499 } else { // tile_support 3500 // no tiles, check cores 3501 nC = 0; 3502 hC = NULL; 3503 // num cores in socket 3504 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3505 &hC); 3506 for (int c = 0; c < NC; ++c) { 3507 // Check Core ------------------------------------------- 3508 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3509 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3510 continue; // skip core if all PUs are out of fullMask 3511 } 3512 ++nC; 3513 if (nC <= __kmp_hws_core.offset || 3514 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3515 // skip node as not requested 3516 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3517 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3518 continue; // move to next node 3519 } 3520 // core requested, go down to PUs 3521 nT = 0; 3522 nTr = 0; 3523 hT = NULL; 3524 // num procs per core 3525 int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, 3526 &hT); 3527 for (int t = 0; t < NT; ++t) { 3528 // Check PU --------------------------------------- 3529 idx = hT->os_index; 3530 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3531 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3532 continue; // skip PU if not in fullMask 3533 } 3534 ++nT; 3535 if (nT <= __kmp_hws_proc.offset || 3536 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3537 // skip PU 3538 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3539 ++n_old; 3540 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3541 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3542 continue; // move to next node 3543 } 3544 ++nTr; 3545 if (pAddr) // collect 
requested thread's data 3546 newAddr[n_new] = (*pAddr)[n_old]; 3547 ++n_new; 3548 ++n_old; 3549 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3550 } // threads loop 3551 if (nTr > 0) { 3552 ++nCr; // num cores per socket 3553 ++nCo; // total num cores 3554 if (nTr > nTpC) 3555 nTpC = nTr; // calc max threads per core 3556 } 3557 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3558 } // cores loop 3559 } // tiles support 3560 } // numa_support 3561 if (nCr > 0) { // found cores? 3562 ++nPkg; // num sockets 3563 if (nCr > nCpP) 3564 nCpP = nCr; // calc max cores per socket 3565 } 3566 } // sockets loop 3567 3568 // check the subset is valid 3569 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3570 KMP_DEBUG_ASSERT(nPkg > 0); 3571 KMP_DEBUG_ASSERT(nCpP > 0); 3572 KMP_DEBUG_ASSERT(nTpC > 0); 3573 KMP_DEBUG_ASSERT(nCo > 0); 3574 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3575 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3576 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3577 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3578 3579 nPackages = nPkg; // correct num sockets 3580 nCoresPerPkg = nCpP; // correct num cores per socket 3581 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3582 __kmp_avail_proc = n_new; // correct num procs 3583 __kmp_ncores = nCo; // correct num cores 3584 // hwloc topology method end 3585 } else 3586 #endif // KMP_USE_HWLOC 3587 { 3588 int n_old = 0, n_new = 0, proc_num = 0; 3589 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3590 KMP_WARNING(AffHWSubsetNoHWLOC); 3591 goto _exit; 3592 } 3593 if (__kmp_hws_socket.num == 0) 3594 __kmp_hws_socket.num = nPackages; // use all available sockets 3595 if (__kmp_hws_core.num == 0) 3596 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3597 if (__kmp_hws_proc.num == 0 || 3598 __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3599 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3600 if ( !__kmp_affinity_uniform_topology() ) { 3601 KMP_WARNING( AffHWSubsetNonUniform ); 3602 goto _exit; // don't support non-uniform topology 3603 } 3604 if ( depth > 3 ) { 3605 KMP_WARNING( AffHWSubsetNonThreeLevel ); 3606 goto _exit; // don't support not-3-level topology 3607 } 3608 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3609 KMP_WARNING(AffHWSubsetManySockets); 3610 goto _exit; 3611 } 3612 if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) { 3613 KMP_WARNING( AffHWSubsetManyCores ); 3614 goto _exit; 3615 } 3616 // Form the requested subset 3617 if (pAddr) // pAddr is NULL in case of affinity_none 3618 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3619 __kmp_hws_socket.num * 3620 __kmp_hws_core.num * 3621 __kmp_hws_proc.num); 3622 for (int i = 0; i < nPackages; ++i) { 3623 if (i < __kmp_hws_socket.offset || 3624 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3625 // skip not-requested socket 3626 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3627 if (__kmp_pu_os_idx != NULL) { 3628 // walk through skipped socket 3629 for (int j = 0; j < nCoresPerPkg; ++j) { 3630 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3631 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3632 ++proc_num; 3633 } 3634 } 3635 } 3636 } else { 3637 // walk through requested socket 3638 for (int j = 0; j < nCoresPerPkg; ++j) { 3639 if (j < __kmp_hws_core.offset || 3640 j >= __kmp_hws_core.offset + __kmp_hws_core.num) 3641 { // skip not-requested core 3642 n_old += __kmp_nThreadsPerCore; 3643 if (__kmp_pu_os_idx != NULL) { 3644 for (int k = 0; k < 
__kmp_nThreadsPerCore; ++k) { 3645 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3646 ++proc_num; 3647 } 3648 } 3649 } else { 3650 // walk through requested core 3651 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3652 if (k < __kmp_hws_proc.num) { 3653 if (pAddr) // collect requested thread's data 3654 newAddr[n_new] = (*pAddr)[n_old]; 3655 n_new++; 3656 } else { 3657 if (__kmp_pu_os_idx != NULL) 3658 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3659 } 3660 n_old++; 3661 ++proc_num; 3662 } 3663 } 3664 } 3665 } 3666 } 3667 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3668 KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num * 3669 __kmp_hws_proc.num); 3670 nPackages = __kmp_hws_socket.num; // correct nPackages 3671 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3672 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3673 __kmp_avail_proc = n_new; // correct avail_proc 3674 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3675 } // non-hwloc topology method 3676 if (pAddr) { 3677 __kmp_free( *pAddr ); 3678 *pAddr = newAddr; // replace old topology with new one 3679 } 3680 if (__kmp_affinity_verbose) { 3681 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3682 __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask); 3683 if (__kmp_affinity_respect_mask) { 3684 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3685 } else { 3686 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3687 } 3688 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3689 kmp_str_buf_t buf; 3690 __kmp_str_buf_init(&buf); 3691 __kmp_str_buf_print(&buf, "%d", nPackages); 3692 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3693 __kmp_nThreadsPerCore, __kmp_ncores); 3694 __kmp_str_buf_free(&buf); 3695 } 3696 _exit: 3697 if (__kmp_pu_os_idx != NULL) { 3698 __kmp_free(__kmp_pu_os_idx); 3699 __kmp_pu_os_idx = NULL; 3700 } 3701 } 3702 3703 // This function figures out the deepest level at which there is at least one 3704 // cluster/core with more than one processing unit bound to it. 3705 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3706 int nprocs, int bottom_level) { 3707 int core_level = 0; 3708 3709 for (int i = 0; i < nprocs; i++) { 3710 for (int j = bottom_level; j > 0; j--) { 3711 if (address2os[i].first.labels[j] > 0) { 3712 if (core_level < (j - 1)) { 3713 core_level = j - 1; 3714 } 3715 } 3716 } 3717 } 3718 return core_level; 3719 } 3720 3721 // This function counts number of clusters/cores at given level. 3722 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3723 int nprocs, int bottom_level, 3724 int core_level) { 3725 int ncores = 0; 3726 int i, j; 3727 3728 j = bottom_level; 3729 for (i = 0; i < nprocs; i++) { 3730 for (j = bottom_level; j > core_level; j--) { 3731 if ((i + 1) < nprocs) { 3732 if (address2os[i + 1].first.labels[j] > 0) { 3733 break; 3734 } 3735 } 3736 } 3737 if (j == core_level) { 3738 ncores++; 3739 } 3740 } 3741 if (j > core_level) { 3742 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 3743 // core. May occur when called from __kmp_affinity_find_core(). 3744 ncores++; 3745 } 3746 return ncores; 3747 } 3748 3749 // This function finds to which cluster/core given processing unit is bound. 
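// It does so by running __kmp_affinity_compute_ncores() over the first
// (proc + 1) entries of the address table, so the value returned is the
// zero-based index of the core owning entry 'proc'. As a sketch: with two
// contexts per core and the table in topology order, procs 0 and 1 map to
// core 0, procs 2 and 3 to core 1, and so on.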
3750 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, 3751 int bottom_level, int core_level) { 3752 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, 3753 core_level) - 1; 3754 } 3755 3756 // This function finds maximal number of processing units bound to a 3757 // cluster/core at given level. 3758 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, 3759 int nprocs, int bottom_level, 3760 int core_level) { 3761 int maxprocpercore = 0; 3762 3763 if (core_level < bottom_level) { 3764 for (int i = 0; i < nprocs; i++) { 3765 int percore = address2os[i].first.labels[core_level + 1] + 1; 3766 3767 if (percore > maxprocpercore) { 3768 maxprocpercore = percore; 3769 } 3770 } 3771 } else { 3772 maxprocpercore = 1; 3773 } 3774 return maxprocpercore; 3775 } 3776 3777 static AddrUnsPair *address2os = NULL; 3778 static int *procarr = NULL; 3779 static int __kmp_aff_depth = 0; 3780 3781 #define KMP_EXIT_AFF_NONE \ 3782 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 3783 KMP_ASSERT(address2os == NULL); \ 3784 __kmp_apply_thread_places(NULL, 0); \ 3785 return; 3786 3787 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) { 3788 const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first); 3789 const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first); 3790 unsigned depth = aa->depth; 3791 unsigned i; 3792 KMP_DEBUG_ASSERT(depth == bb->depth); 3793 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 3794 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 3795 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 3796 int j = depth - i - 1; 3797 if (aa->childNums[j] < bb->childNums[j]) 3798 return -1; 3799 if (aa->childNums[j] > bb->childNums[j]) 3800 return 1; 3801 } 3802 for (; i < depth; i++) { 3803 int j = i - __kmp_affinity_compact; 3804 if (aa->childNums[j] < bb->childNums[j]) 3805 return -1; 3806 if (aa->childNums[j] > bb->childNums[j]) 3807 return 1; 3808 } 3809 return 0; 3810 } 3811 3812 static void __kmp_aux_affinity_initialize(void) { 3813 if (__kmp_affinity_masks != NULL) { 3814 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3815 return; 3816 } 3817 3818 // Create the "full" mask - this defines all of the processors that we 3819 // consider to be in the machine model. If respect is set, then it is the 3820 // initialization thread's affinity mask. Otherwise, it is all processors that 3821 // we know about on the machine. 3822 if (__kmp_affin_fullMask == NULL) { 3823 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3824 } 3825 if (KMP_AFFINITY_CAPABLE()) { 3826 if (__kmp_affinity_respect_mask) { 3827 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3828 3829 // Count the number of available processors. 
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }
    } else {
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
    }
  }

  int depth = -1;
  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or
    // the verbose flag is set, and the nowarnings flag was not set.
    const char *file_name = NULL;
    int line = 0;
#if KMP_USE_HWLOC
    if (depth < 0 &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      }
      if (!__kmp_hwloc_error) {
        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        } else if (depth < 0 && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
      }

      file_name = NULL;
      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }

      if (depth < 0) {
        if (__kmp_affinity_verbose) {
          if (msg_id != kmp_i18n_null) {
            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
                       __kmp_i18n_catgets(msg_id),
                       KMP_I18N_STR(DecodingLegacyAPIC));
          } else {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
                       KMP_I18N_STR(DecodingLegacyAPIC));
          }
        }

        file_name = NULL;
        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        if (msg_id != kmp_i18n_null) {
          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
        } else {
          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
        }
      }

      FILE *f = fopen("/proc/cpuinfo", "r");
      if (f == NULL) {
        msg_id = kmp_i18n_str_CantOpenCpuinfo;
      } else {
        file_name = "/proc/cpuinfo";
        depth =
            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY

    if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
      }

      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
      KMP_ASSERT(depth != 0);
    }

#endif /* KMP_GROUP_AFFINITY */

    if (depth < 0) {
      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
        if (file_name == NULL) {
          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
        } else if (line == 0) {
          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
        } else {
          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
                     __kmp_i18n_catgets(msg_id));
        }
      }
      // FIXME - print msg if msg_id = kmp_i18n_null ???

      file_name = "";
      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }
      KMP_ASSERT(depth > 0);
      KMP_ASSERT(address2os != NULL);
    }
  }

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
    }

    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
    }

    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    const char *filename;
    if (__kmp_cpuinfo_file != NULL) {
      filename = __kmp_cpuinfo_file;
    } else {
      filename = "/proc/cpuinfo";
    }

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
    }

    FILE *f = fopen(filename, "r");
    if (f == NULL) {
      int code = errno;
      if (__kmp_cpuinfo_file != NULL) {
        __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
                  KMP_ERR(code), KMP_HNT(NameComesFrom_CPUINFO_FILE),
                  __kmp_msg_null);
      } else {
        __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
                  KMP_ERR(code), __kmp_msg_null);
      }
    }
    int line = 0;
    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
    fclose(f);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
    if (__kmp_affinity_type == affinity_none) {
      KMP_ASSERT(depth == 0);
      KMP_EXIT_AFF_NONE;
    }
  }

#if KMP_GROUP_AFFINITY

  else if (__kmp_affinity_top_method ==
affinity_top_method_group) { 4069 if (__kmp_affinity_verbose) { 4070 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4071 } 4072 4073 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4074 KMP_ASSERT(depth != 0); 4075 if (depth < 0) { 4076 KMP_ASSERT(msg_id != kmp_i18n_null); 4077 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4078 } 4079 } 4080 4081 #endif /* KMP_GROUP_AFFINITY */ 4082 4083 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4084 if (__kmp_affinity_verbose) { 4085 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4086 } 4087 4088 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4089 if (depth == 0) { 4090 KMP_EXIT_AFF_NONE; 4091 } 4092 // should not fail 4093 KMP_ASSERT(depth > 0); 4094 KMP_ASSERT(address2os != NULL); 4095 } 4096 4097 #if KMP_USE_HWLOC 4098 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4099 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4100 if (__kmp_affinity_verbose) { 4101 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4102 } 4103 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4104 if (depth == 0) { 4105 KMP_EXIT_AFF_NONE; 4106 } 4107 } 4108 #endif // KMP_USE_HWLOC 4109 4110 if (address2os == NULL) { 4111 if (KMP_AFFINITY_CAPABLE() && 4112 (__kmp_affinity_verbose || 4113 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 4114 KMP_WARNING(ErrorInitializeAffinity); 4115 } 4116 __kmp_affinity_type = affinity_none; 4117 KMP_AFFINITY_DISABLE(); 4118 return; 4119 } 4120 4121 __kmp_apply_thread_places(&address2os, depth); 4122 4123 // Create the table of masks, indexed by thread Id. 4124 unsigned maxIndex; 4125 unsigned numUnique; 4126 kmp_affin_mask_t *osId2Mask = 4127 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc); 4128 if (__kmp_affinity_gran_levels == 0) { 4129 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4130 } 4131 4132 // Set the childNums vector in all Address objects. This must be done before 4133 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into 4134 // account the setting of __kmp_affinity_compact. 4135 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4136 4137 switch (__kmp_affinity_type) { 4138 4139 case affinity_explicit: 4140 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4141 #if OMP_40_ENABLED 4142 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4143 #endif 4144 { 4145 __kmp_affinity_process_proclist( 4146 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4147 __kmp_affinity_proclist, osId2Mask, maxIndex); 4148 } 4149 #if OMP_40_ENABLED 4150 else { 4151 __kmp_affinity_process_placelist( 4152 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 4153 __kmp_affinity_proclist, osId2Mask, maxIndex); 4154 } 4155 #endif 4156 if (__kmp_affinity_num_masks == 0) { 4157 if (__kmp_affinity_verbose || 4158 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 4159 KMP_WARNING(AffNoValidProcID); 4160 } 4161 __kmp_affinity_type = affinity_none; 4162 return; 4163 } 4164 break; 4165 4166 // The other affinity types rely on sorting the Addresses according to some 4167 // permutation of the machine topology tree. Set __kmp_affinity_compact and 4168 // __kmp_affinity_offset appropriately, then jump to a common code fragment 4169 // to do the sort and create the array of affinity masks. 
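  // A rough illustration, assuming a uniform machine whose labels are ordered
  // {package, core, thread} (depth == 3): __kmp_affinity_cmp_Address_child_num()
  // gives the __kmp_affinity_compact innermost levels the highest sort
  // priority. With __kmp_affinity_compact == 0 the effective key is
  // (package, core, thread), so consecutive masks fill one core before moving
  // to the next; with __kmp_affinity_compact == depth - 1 (the scatter case
  // below) the key becomes (thread, core, package), so consecutive masks land
  // on different packages first.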
4170 4171 case affinity_logical: 4172 __kmp_affinity_compact = 0; 4173 if (__kmp_affinity_offset) { 4174 __kmp_affinity_offset = 4175 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4176 } 4177 goto sortAddresses; 4178 4179 case affinity_physical: 4180 if (__kmp_nThreadsPerCore > 1) { 4181 __kmp_affinity_compact = 1; 4182 if (__kmp_affinity_compact >= depth) { 4183 __kmp_affinity_compact = 0; 4184 } 4185 } else { 4186 __kmp_affinity_compact = 0; 4187 } 4188 if (__kmp_affinity_offset) { 4189 __kmp_affinity_offset = 4190 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 4191 } 4192 goto sortAddresses; 4193 4194 case affinity_scatter: 4195 if (__kmp_affinity_compact >= depth) { 4196 __kmp_affinity_compact = 0; 4197 } else { 4198 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4199 } 4200 goto sortAddresses; 4201 4202 case affinity_compact: 4203 if (__kmp_affinity_compact >= depth) { 4204 __kmp_affinity_compact = depth - 1; 4205 } 4206 goto sortAddresses; 4207 4208 case affinity_balanced: 4209 if (depth <= 1) { 4210 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4211 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4212 } 4213 __kmp_affinity_type = affinity_none; 4214 return; 4215 } else if (__kmp_affinity_uniform_topology()) { 4216 break; 4217 } else { // Non-uniform topology 4218 4219 // Save the depth for further usage 4220 __kmp_aff_depth = depth; 4221 4222 int core_level = __kmp_affinity_find_core_level( 4223 address2os, __kmp_avail_proc, depth - 1); 4224 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4225 depth - 1, core_level); 4226 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4227 address2os, __kmp_avail_proc, depth - 1, core_level); 4228 4229 int nproc = ncores * maxprocpercore; 4230 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4231 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4232 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4233 } 4234 __kmp_affinity_type = affinity_none; 4235 return; 4236 } 4237 4238 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4239 for (int i = 0; i < nproc; i++) { 4240 procarr[i] = -1; 4241 } 4242 4243 int lastcore = -1; 4244 int inlastcore = 0; 4245 for (int i = 0; i < __kmp_avail_proc; i++) { 4246 int proc = address2os[i].second; 4247 int core = 4248 __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4249 4250 if (core == lastcore) { 4251 inlastcore++; 4252 } else { 4253 inlastcore = 0; 4254 } 4255 lastcore = core; 4256 4257 procarr[core * maxprocpercore + inlastcore] = proc; 4258 } 4259 4260 break; 4261 } 4262 4263 sortAddresses: 4264 // Allocate the gtid->affinity mask table. 4265 if (__kmp_affinity_dups) { 4266 __kmp_affinity_num_masks = __kmp_avail_proc; 4267 } else { 4268 __kmp_affinity_num_masks = numUnique; 4269 } 4270 4271 #if OMP_40_ENABLED 4272 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4273 (__kmp_affinity_num_places > 0) && 4274 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4275 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4276 } 4277 #endif 4278 4279 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4280 4281 // Sort the address2os table according to the current setting of 4282 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
    {
      int i;
      unsigned j;
      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
          continue;
        }
        unsigned osId = address2os[i].second;
        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
  machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none
  // in this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
#if OMP_40_ENABLED
  __kmp_affinity_num_places = 0;
#endif
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (address2os != NULL) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  KMPAffinity::destroy_api();
}

void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
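  // When a table of masks was built, the thread is handed mask number
  // (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks, i.e. a simple
  // round-robin over the places. As a sketch: with 4 masks and
  // __kmp_affinity_offset == 1, gtids 0..4 would get places 1, 2, 3, 0, 1.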
4387 kmp_affin_mask_t *mask; 4388 int i; 4389 4390 #if OMP_40_ENABLED 4391 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4392 #endif 4393 { 4394 if ((__kmp_affinity_type == affinity_none) || 4395 (__kmp_affinity_type == affinity_balanced)) { 4396 #if KMP_GROUP_AFFINITY 4397 if (__kmp_num_proc_groups > 1) { 4398 return; 4399 } 4400 #endif 4401 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4402 i = KMP_PLACE_ALL; 4403 mask = __kmp_affin_fullMask; 4404 } else { 4405 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4406 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4407 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4408 } 4409 } 4410 #if OMP_40_ENABLED 4411 else { 4412 if ((!isa_root) || 4413 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4414 #if KMP_GROUP_AFFINITY 4415 if (__kmp_num_proc_groups > 1) { 4416 return; 4417 } 4418 #endif 4419 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4420 i = KMP_PLACE_ALL; 4421 mask = __kmp_affin_fullMask; 4422 } else { 4423 // int i = some hash function or just a counter that doesn't 4424 // always start at 0. Use gtid for now. 4425 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4426 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4427 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4428 } 4429 } 4430 #endif 4431 4432 #if OMP_40_ENABLED 4433 th->th.th_current_place = i; 4434 if (isa_root) { 4435 th->th.th_new_place = i; 4436 th->th.th_first_place = 0; 4437 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4438 } 4439 4440 if (i == KMP_PLACE_ALL) { 4441 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4442 gtid)); 4443 } else { 4444 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4445 gtid, i)); 4446 } 4447 #else 4448 if (i == -1) { 4449 KA_TRACE( 4450 100, 4451 ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4452 gtid)); 4453 } else { 4454 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4455 gtid, i)); 4456 } 4457 #endif /* OMP_40_ENABLED */ 4458 4459 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4460 4461 if (__kmp_affinity_verbose) { 4462 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4463 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4464 th->th.th_affin_mask); 4465 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4466 __kmp_gettid(), gtid, buf); 4467 } 4468 4469 #if KMP_OS_WINDOWS 4470 // On Windows* OS, the process affinity mask might have changed. If the user 4471 // didn't request affinity and this call fails, just continue silently. 4472 // See CQ171393. 4473 if (__kmp_affinity_type == affinity_none) { 4474 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4475 } else 4476 #endif 4477 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4478 } 4479 4480 #if OMP_40_ENABLED 4481 4482 void __kmp_affinity_set_place(int gtid) { 4483 int retval; 4484 4485 if (!KMP_AFFINITY_CAPABLE()) { 4486 return; 4487 } 4488 4489 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4490 4491 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4492 "place = %d)\n", 4493 gtid, th->th.th_new_place, th->th.th_current_place)); 4494 4495 // Check that the new place is within this thread's partition. 
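  // A partition is normally the contiguous range [th_first_place,
  // th_last_place]; when th_first_place > th_last_place the partition is
  // taken to wrap around the end of the place list, so the bounds check
  // below is relaxed accordingly.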
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

#endif /* OMP_40_ENABLED */

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(1000, ; {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf(
        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
        buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

#if OMP_40_ENABLED
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
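  // (Presumably so that a later place-based rebinding from OMP_PROC_BIND or a
  // proc_bind clause does not override the mask the user just installed.)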
4587 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4588 #endif 4589 4590 return retval; 4591 } 4592 4593 int __kmp_aux_get_affinity(void **mask) { 4594 int gtid; 4595 int retval; 4596 kmp_info_t *th; 4597 4598 if (!KMP_AFFINITY_CAPABLE()) { 4599 return -1; 4600 } 4601 4602 gtid = __kmp_entry_gtid(); 4603 th = __kmp_threads[gtid]; 4604 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4605 4606 KA_TRACE(1000, ; { 4607 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4608 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4609 th->th.th_affin_mask); 4610 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", 4611 gtid, buf); 4612 }); 4613 4614 if (__kmp_env_consistency_check) { 4615 if ((mask == NULL) || (*mask == NULL)) { 4616 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4617 } 4618 } 4619 4620 #if !KMP_OS_WINDOWS 4621 4622 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4623 KA_TRACE(1000, ; { 4624 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4625 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4626 (kmp_affin_mask_t *)(*mask)); 4627 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", 4628 gtid, buf); 4629 }); 4630 return retval; 4631 4632 #else 4633 4634 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4635 return 0; 4636 4637 #endif /* KMP_OS_WINDOWS */ 4638 } 4639 4640 int __kmp_aux_get_affinity_max_proc() { 4641 if (!KMP_AFFINITY_CAPABLE()) { 4642 return 0; 4643 } 4644 #if KMP_GROUP_AFFINITY 4645 if (__kmp_num_proc_groups > 1) { 4646 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4647 } 4648 #endif 4649 return __kmp_xproc; 4650 } 4651 4652 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4653 int retval; 4654 4655 if (!KMP_AFFINITY_CAPABLE()) { 4656 return -1; 4657 } 4658 4659 KA_TRACE(1000, ; { 4660 int gtid = __kmp_entry_gtid(); 4661 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4662 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4663 (kmp_affin_mask_t *)(*mask)); 4664 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4665 "affinity mask for thread %d = %s\n", 4666 proc, gtid, buf); 4667 }); 4668 4669 if (__kmp_env_consistency_check) { 4670 if ((mask == NULL) || (*mask == NULL)) { 4671 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4672 } 4673 } 4674 4675 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4676 return -1; 4677 } 4678 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4679 return -2; 4680 } 4681 4682 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4683 return 0; 4684 } 4685 4686 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4687 int retval; 4688 4689 if (!KMP_AFFINITY_CAPABLE()) { 4690 return -1; 4691 } 4692 4693 KA_TRACE(1000, ; { 4694 int gtid = __kmp_entry_gtid(); 4695 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4696 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4697 (kmp_affin_mask_t *)(*mask)); 4698 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4699 "affinity mask for thread %d = %s\n", 4700 proc, gtid, buf); 4701 }); 4702 4703 if (__kmp_env_consistency_check) { 4704 if ((mask == NULL) || (*mask == NULL)) { 4705 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4706 } 4707 } 4708 4709 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4710 return -1; 4711 } 4712 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4713 return -2; 4714 } 4715 4716 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4717 return 0; 4718 } 
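// A minimal usage sketch of the user-level API that these __kmp_aux_* helpers
// back (the kmp_* entry points are assumed to be the affinity routines
// exposed through omp.h; error handling abbreviated):
//
//   kmp_affinity_mask_t mask;
//   kmp_create_affinity_mask(&mask);
//   if (kmp_set_affinity_mask_proc(3, &mask) != 0) {
//     /* proc 3 is out of range or not in the machine model */
//   }
//   if (kmp_set_affinity(&mask) != 0) {
//     /* the mask could not be installed for the calling thread */
//   }
//   kmp_destroy_affinity_mask(&mask);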
4719 4720 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4721 int retval; 4722 4723 if (!KMP_AFFINITY_CAPABLE()) { 4724 return -1; 4725 } 4726 4727 KA_TRACE(1000, ; { 4728 int gtid = __kmp_entry_gtid(); 4729 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4730 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4731 (kmp_affin_mask_t *)(*mask)); 4732 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4733 "affinity mask for thread %d = %s\n", 4734 proc, gtid, buf); 4735 }); 4736 4737 if (__kmp_env_consistency_check) { 4738 if ((mask == NULL) || (*mask == NULL)) { 4739 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4740 } 4741 } 4742 4743 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4744 return -1; 4745 } 4746 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4747 return 0; 4748 } 4749 4750 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4751 } 4752 4753 // Dynamic affinity settings - Affinity balanced 4754 void __kmp_balanced_affinity(int tid, int nthreads) { 4755 bool fine_gran = true; 4756 4757 switch (__kmp_affinity_gran) { 4758 case affinity_gran_fine: 4759 case affinity_gran_thread: 4760 break; 4761 case affinity_gran_core: 4762 if (__kmp_nThreadsPerCore > 1) { 4763 fine_gran = false; 4764 } 4765 break; 4766 case affinity_gran_package: 4767 if (nCoresPerPkg > 1) { 4768 fine_gran = false; 4769 } 4770 break; 4771 default: 4772 fine_gran = false; 4773 } 4774 4775 if (__kmp_affinity_uniform_topology()) { 4776 int coreID; 4777 int threadID; 4778 // Number of hyper threads per core in HT machine 4779 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4780 // Number of cores 4781 int ncores = __kmp_ncores; 4782 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4783 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4784 ncores = nPackages; 4785 } 4786 // How many threads will be bound to each core 4787 int chunk = nthreads / ncores; 4788 // How many cores will have an additional thread bound to it - "big cores" 4789 int big_cores = nthreads % ncores; 4790 // Number of threads on the big cores 4791 int big_nth = (chunk + 1) * big_cores; 4792 if (tid < big_nth) { 4793 coreID = tid / (chunk + 1); 4794 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4795 } else { // tid >= big_nth 4796 coreID = (tid - big_cores) / chunk; 4797 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4798 } 4799 4800 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4801 "Illegal set affinity operation when not capable"); 4802 4803 kmp_affin_mask_t *mask; 4804 KMP_CPU_ALLOC_ON_STACK(mask); 4805 KMP_CPU_ZERO(mask); 4806 4807 if (fine_gran) { 4808 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second; 4809 KMP_CPU_SET(osID, mask); 4810 } else { 4811 for (int i = 0; i < __kmp_nth_per_core; i++) { 4812 int osID; 4813 osID = address2os[coreID * __kmp_nth_per_core + i].second; 4814 KMP_CPU_SET(osID, mask); 4815 } 4816 } 4817 if (__kmp_affinity_verbose) { 4818 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4819 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4820 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4821 __kmp_gettid(), tid, buf); 4822 } 4823 __kmp_set_system_affinity(mask, TRUE); 4824 KMP_CPU_FREE_FROM_STACK(mask); 4825 } else { // Non-uniform topology 4826 4827 kmp_affin_mask_t *mask; 4828 KMP_CPU_ALLOC_ON_STACK(mask); 4829 KMP_CPU_ZERO(mask); 4830 4831 int core_level = __kmp_affinity_find_core_level( 4832 address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 4833 int ncores = 
__kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, 4834 __kmp_aff_depth - 1, core_level); 4835 int nth_per_core = __kmp_affinity_max_proc_per_core( 4836 address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4837 4838 // For performance gain consider the special case nthreads == 4839 // __kmp_avail_proc 4840 if (nthreads == __kmp_avail_proc) { 4841 if (fine_gran) { 4842 int osID = address2os[tid].second; 4843 KMP_CPU_SET(osID, mask); 4844 } else { 4845 int core = __kmp_affinity_find_core(address2os, tid, 4846 __kmp_aff_depth - 1, core_level); 4847 for (int i = 0; i < __kmp_avail_proc; i++) { 4848 int osID = address2os[i].second; 4849 if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, 4850 core_level) == core) { 4851 KMP_CPU_SET(osID, mask); 4852 } 4853 } 4854 } 4855 } else if (nthreads <= ncores) { 4856 4857 int core = 0; 4858 for (int i = 0; i < ncores; i++) { 4859 // Check if this core from procarr[] is in the mask 4860 int in_mask = 0; 4861 for (int j = 0; j < nth_per_core; j++) { 4862 if (procarr[i * nth_per_core + j] != -1) { 4863 in_mask = 1; 4864 break; 4865 } 4866 } 4867 if (in_mask) { 4868 if (tid == core) { 4869 for (int j = 0; j < nth_per_core; j++) { 4870 int osID = procarr[i * nth_per_core + j]; 4871 if (osID != -1) { 4872 KMP_CPU_SET(osID, mask); 4873 // For fine granularity it is enough to set the first available 4874 // osID for this core 4875 if (fine_gran) { 4876 break; 4877 } 4878 } 4879 } 4880 break; 4881 } else { 4882 core++; 4883 } 4884 } 4885 } 4886 } else { // nthreads > ncores 4887 // Array to save the number of processors at each core 4888 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4889 // Array to save the number of cores with "x" available processors; 4890 int *ncores_with_x_procs = 4891 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4892 // Array to save the number of cores with # procs from x to nth_per_core 4893 int *ncores_with_x_to_max_procs = 4894 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4895 4896 for (int i = 0; i <= nth_per_core; i++) { 4897 ncores_with_x_procs[i] = 0; 4898 ncores_with_x_to_max_procs[i] = 0; 4899 } 4900 4901 for (int i = 0; i < ncores; i++) { 4902 int cnt = 0; 4903 for (int j = 0; j < nth_per_core; j++) { 4904 if (procarr[i * nth_per_core + j] != -1) { 4905 cnt++; 4906 } 4907 } 4908 nproc_at_core[i] = cnt; 4909 ncores_with_x_procs[cnt]++; 4910 } 4911 4912 for (int i = 0; i <= nth_per_core; i++) { 4913 for (int j = i; j <= nth_per_core; j++) { 4914 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4915 } 4916 } 4917 4918 // Max number of processors 4919 int nproc = nth_per_core * ncores; 4920 // An array to keep number of threads per each context 4921 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4922 for (int i = 0; i < nproc; i++) { 4923 newarr[i] = 0; 4924 } 4925 4926 int nth = nthreads; 4927 int flag = 0; 4928 while (nth > 0) { 4929 for (int j = 1; j <= nth_per_core; j++) { 4930 int cnt = ncores_with_x_to_max_procs[j]; 4931 for (int i = 0; i < ncores; i++) { 4932 // Skip the core with 0 processors 4933 if (nproc_at_core[i] == 0) { 4934 continue; 4935 } 4936 for (int k = 0; k < nth_per_core; k++) { 4937 if (procarr[i * nth_per_core + k] != -1) { 4938 if (newarr[i * nth_per_core + k] == 0) { 4939 newarr[i * nth_per_core + k] = 1; 4940 cnt--; 4941 nth--; 4942 break; 4943 } else { 4944 if (flag != 0) { 4945 newarr[i * nth_per_core + k]++; 4946 cnt--; 4947 nth--; 4948 break; 4949 } 4950 } 4951 } 4952 } 4953 if (cnt == 0 || nth == 0) { 4954 break; 4955 } 
4956 } 4957 if (nth == 0) { 4958 break; 4959 } 4960 } 4961 flag = 1; 4962 } 4963 int sum = 0; 4964 for (int i = 0; i < nproc; i++) { 4965 sum += newarr[i]; 4966 if (sum > tid) { 4967 if (fine_gran) { 4968 int osID = procarr[i]; 4969 KMP_CPU_SET(osID, mask); 4970 } else { 4971 int coreID = i / nth_per_core; 4972 for (int ii = 0; ii < nth_per_core; ii++) { 4973 int osID = procarr[coreID * nth_per_core + ii]; 4974 if (osID != -1) { 4975 KMP_CPU_SET(osID, mask); 4976 } 4977 } 4978 } 4979 break; 4980 } 4981 } 4982 __kmp_free(newarr); 4983 } 4984 4985 if (__kmp_affinity_verbose) { 4986 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4987 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4988 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4989 __kmp_gettid(), tid, buf); 4990 } 4991 __kmp_set_system_affinity(mask, TRUE); 4992 KMP_CPU_FREE_FROM_STACK(mask); 4993 } 4994 } 4995 4996 #if KMP_OS_LINUX 4997 // We don't need this entry for Windows because 4998 // there is GetProcessAffinityMask() api 4999 // 5000 // The intended usage is indicated by these steps: 5001 // 1) The user gets the current affinity mask 5002 // 2) Then sets the affinity by calling this function 5003 // 3) Error check the return value 5004 // 4) Use non-OpenMP parallelization 5005 // 5) Reset the affinity to what was stored in step 1) 5006 #ifdef __cplusplus 5007 extern "C" 5008 #endif 5009 int 5010 kmp_set_thread_affinity_mask_initial() 5011 // the function returns 0 on success, 5012 // -1 if we cannot bind thread 5013 // >0 (errno) if an error happened during binding 5014 { 5015 int gtid = __kmp_get_gtid(); 5016 if (gtid < 0) { 5017 // Do not touch non-omp threads 5018 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5019 "non-omp thread, returning\n")); 5020 return -1; 5021 } 5022 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5023 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5024 "affinity not initialized, returning\n")); 5025 return -1; 5026 } 5027 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 5028 "set full mask for thread %d\n", 5029 gtid)); 5030 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5031 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5032 } 5033 #endif 5034 5035 #endif // KMP_AFFINITY_SUPPORTED 5036