/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }


void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

// Print the affinity mask to the character array in a pretty format.
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  KMP_ASSERT(buf_len >= 40);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Find first element / check for empty set.
  size_t i;
  i = mask->begin();
  if (i == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    while (*scan != '\0')
      scan++;
    KMP_ASSERT(scan <= end);
    return buf;
  }

  KMP_SNPRINTF(scan, end - scan + 1, "{%ld", (long)i);
  while (*scan != '\0')
    scan++;
  i++;
  for (; i != mask->end(); i = mask->next(i)) {
    if (!KMP_CPU_ISSET(i, mask)) {
      continue;
    }

    // Check for buffer overflow. A string of the form ",<n>" will have at most
    // 10 characters, plus we want to leave room to print ",...}" if the set is
    // too large to print for a total of 15 characters. We already left room for
    // '\0' in setting end.
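    // For illustration (not an exhaustive spec of the format): a mask
    // containing procs {0,1,2,5} prints as "{0,1,2,5}"; if the buffer budget
    // runs out first, the output is truncated to something like "{0,1,...}".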
    if (end - scan < 15) {
      break;
    }
    KMP_SNPRINTF(scan, end - scan + 1, ",%-ld", (long)i);
    while (*scan != '\0')
      scan++;
  }
  if (i != mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, ",...");
    while (*scan != '\0')
      scan++;
  }
  KMP_SNPRINTF(scan, end - scan + 1, "}");
  while (*scan != '\0')
    scan++;
  KMP_ASSERT(scan <= end);
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling
// __kmp_get_system_affinity() again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels. So they are removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
                                                  int nActiveThreads, int depth,
                                                  int *pkgLevel, int *coreLevel,
                                                  int *threadLevel) {
  int level;
  int i;
  int radix1_detected;

  for (level = depth - 1; level >= 0; --level) {
    // Always keep the package level
    if (level == *pkgLevel)
      continue;
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nActiveThreads; ++i) {
      if (address2os[0].first.labels[level] !=
          address2os[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    if (level == *threadLevel) {
      // If only one thread per core, then just decrement
      // the depth which removes the threadlevel from address2os
      for (i = 0; i < nActiveThreads; ++i) {
        address2os[i].first.depth--;
      }
      *threadLevel = -1;
    } else if (level == *coreLevel) {
      // For core level, we move the thread labels over if they are still
      // valid (*threadLevel != -1), and also reduce the depth another level
      for (i = 0; i < nActiveThreads; ++i) {
        if (*threadLevel != -1) {
          address2os[i].first.labels[*coreLevel] =
              address2os[i].first.labels[*threadLevel];
        }
        address2os[i].first.depth--;
      }
      *coreLevel = -1;
    }
  }
  return address2os[0].first.depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PUs under the package
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
           obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  int depth = 3;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0),
        HWLOC_OBJ_CORE);
    __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(
        hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0),
        HWLOC_OBJ_PU);
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages. Make sure all these vars are set
  // correctly, and return if affinity is not enabled.

  hwloc_obj_t pu;
  hwloc_obj_t core;
  hwloc_obj_t socket;
  int nActiveThreads = 0;
  int socket_identifier = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  for (socket =
           hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PACKAGE, 0);
       socket != NULL;
       socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology,
                                           HWLOC_OBJ_PACKAGE, socket),
      socket_identifier++) {
    int core_identifier = 0;
    int num_active_cores = 0;
    for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type,
                                            socket->logical_index,
                                            HWLOC_OBJ_CORE, 0);
         core != NULL &&
         hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type,
                                        core) == socket;
         core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE,
                                           core),
        core_identifier++) {
      int pu_identifier = 0;
      int num_active_threads = 0;
      for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type,
                                            core->logical_index, HWLOC_OBJ_PU,
                                            0);
           pu != NULL &&
           hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type,
                                          pu) == core;
           pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU,
                                           pu),
          pu_identifier++) {
        Address addr(3);
        if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
          continue; // skip inactive (inaccessible) unit
        KA_TRACE(20,
                 ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                  socket->os_index, socket->logical_index, core->os_index,
                  core->logical_index, pu->os_index, pu->logical_index));
        addr.labels[0] = socket_identifier; // package
        addr.labels[1] = core_identifier; // core
        addr.labels[2] = pu_identifier; // pu
        retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
        __kmp_pu_os_idx[nActiveThreads] =
            pu->os_index; // keep os index for each active pu
        nActiveThreads++;
        ++num_active_threads; // count active threads per core
      }
      if (num_active_threads) { // were there any active threads on the core?
        ++__kmp_ncores; // count total active cores
        ++num_active_cores; // count active cores per socket
        if (num_active_threads > __kmp_nThreadsPerCore)
          __kmp_nThreadsPerCore = num_active_threads; // calc maximum
      }
    }
    if (num_active_cores) { // were there any active cores on the socket?
      ++nPackages; // count total active packages
      if (num_active_cores > nCoresPerPkg)
        nCoresPerPkg = num_active_cores; // calc maximum
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  unsigned uniform =
      (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", nPackages);
    // for (level = 1; level <= pkgLevel; level++) {
    //   __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    // }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
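  // For example, a depth-3 map {package, core, thread} in which every core
  // runs exactly one thread context is collapsed to a depth-2 {package, core}
  // map, and threadLevel is set to -1 by the call below.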
  depth = __kmp_affinity_remove_radix_one_levels(
      retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
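  // For illustration: on a machine with four available OS procs 0-3, the flat
  // map built below is simply {0}=>0, {1}=>1, {2}=>2, {3}=>3 - each OS proc
  // becomes its own depth-1 "package".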
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  unsigned int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); // processor group
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); // proc within group
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity "gran" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a,
                                                   const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->osId < bb->osId)
    return -1;
  if (aa->osId > bb->osId)
    return 1;
  return 0;
}

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  int rc;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
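  // Note (general cpuid behavior, not specific to this file): cpuid leaf 0
  // returns in eax the maximum input value supported for basic leaves, so
  // comparing eax against 4 tells us whether leaf 4 can be queried at all.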
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this only as an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return -1;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
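    // Worked example (illustrative values, not from any particular chip):
    // with maxThreadsPerPkg = 8 and maxCoresPerPkg = 4, the mask widths are
    // widthCT = 3, widthC = 2, widthT = 1. An apicId of 0x1d (binary 11101)
    // then decomposes as pkgId = 0x1d >> 3 = 3, coreId = (0x1d >> 1) & 3 = 2,
    // and threadId = 0x1d & 1 = 1.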
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_set_system_affinity(oldMask, TRUE);
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, form an Address object
  // with depth 1 and return immediately (or, if affinity is off, set
  // address2os to NULL and return).
  //
  // If it is configured to omit the package level when there is only a single
  // package, the logic at the end of this routine won't work if there is only
  // a single thread - it would try to form an Address object with depth 0.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
    Address addr(1);
    addr.labels[0] = threadInfo[0].pkgId;
    (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return -1;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }
  }
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
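  // Illustrative example of the counting loop above: a sorted table for a
  // uniform 2 packages x 2 cores x 2 threads machine yields pkgCt = 2 and
  // nCores = 4, with running maxima coreCt = 2 and threadCt = 2, so
  // nCoresPerPkg = 2 and __kmp_nThreadsPerCore = 2.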
  __kmp_ncores = nCores;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (__kmp_affinity_uniform_topology()) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (i = 0; i < nApics; ++i) {
    __kmp_pu_os_idx[i] = threadInfo[i].osId;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int pkgLevel = 0;
  int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
  int threadLevel =
      (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

  KMP_ASSERT(depth > 0);
  *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

  for (i = 0; i < nApics; ++i) {
    Address addr(depth);
    unsigned os = threadInfo[i].osId;
    int d = 0;

    if (pkgLevel >= 0) {
      addr.labels[d++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      addr.labels[d++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      addr.labels[d++] = threadInfo[i].threadId;
    }
    (*address2os)[i] = AddrUnsPair(addr, os);
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled in the
    // machine topology map.
    __kmp_affinity_gran_levels = 0;
    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
      __kmp_affinity_gran_levels++;
    }
    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
      __kmp_affinity_gran_levels++;
    }
    if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
      __kmp_affinity_gran_levels++;
    }
  }

  if (__kmp_affinity_verbose) {
    __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
                                  coreLevel, threadLevel);
  }

  __kmp_free(threadInfo);
  KMP_CPU_FREE(oldMask);
  return depth;
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                              kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check to see if cpuid leaf 11 is supported.
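  // A quick reference for cpuid leaf 11 (x2APIC topology enumeration), per the
  // Intel SDM: for each sub-leaf, ecx bits 15:8 give the level type (1 = SMT,
  // 2 = core), eax bits 4:0 give the number of APIC id bits to shift away to
  // reach the next level, and ebx == 0 marks the end of the enumeration.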
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 11) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }
  __kmp_x86_cpuid(11, 0, &buf);
  if (buf.ebx == 0) {
    *msg_id = kmp_i18n_str_NoLeaf11Support;
    return -1;
  }

  // Find the number of levels in the machine topology. While we're at it, get
  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
  // get more accurate values later by explicitly counting them, but get
  // reasonable defaults now, in case we return early.
  int level;
  int threadLevel = -1;
  int coreLevel = -1;
  int pkgLevel = -1;
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

  for (level = 0;; level++) {
    if (level > 31) {
      // FIXME: Hack for DPD200163180
      //
      // If level is big then something went wrong -> exiting
      //
      // There could actually be 32 valid levels in the machine topology, but so
      // far, the only machine we have seen which does not exit this loop before
      // iteration 32 has fubar x2APIC settings.
      //
      // For now, just reject this case based upon loop trip count.
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return -1;
    }
    __kmp_x86_cpuid(11, level, &buf);
    if (buf.ebx == 0) {
      if (pkgLevel < 0) {
        // Will infer nPackages from __kmp_xproc
        pkgLevel = level;
        level++;
      }
      break;
    }
    int kind = (buf.ecx >> 8) & 0xff;
    if (kind == 1) {
      // SMT level
      threadLevel = level;
      coreLevel = -1;
      pkgLevel = -1;
      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
      if (__kmp_nThreadsPerCore == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else if (kind == 2) {
      // core level
      coreLevel = level;
      pkgLevel = -1;
      nCoresPerPkg = buf.ebx & 0xffff;
      if (nCoresPerPkg == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    } else {
      if (level <= 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
      if (pkgLevel >= 0) {
        continue;
      }
      pkgLevel = level;
      nPackages = buf.ebx & 0xffff;
      if (nPackages == 0) {
        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
    }
  }
  int depth = level;

  // In the above loop, "level" was counted from the finest level (usually
  // thread) to the coarsest. The caller expects that we will place the labels
  // in (*address2os)[].first.labels[] in the inverse order, so we need to
  // invert the vars saying which level means what.
  if (threadLevel >= 0) {
    threadLevel = depth - threadLevel - 1;
  }
  if (coreLevel >= 0) {
    coreLevel = depth - coreLevel - 1;
  }
  KMP_DEBUG_ASSERT(pkgLevel >= 0);
  pkgLevel = depth - pkgLevel - 1;

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  unsigned int proc;
  int nApics = 0;
  KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(proc);

    // Extract labels for each level in the machine topology map from Apic ID.
    Address addr(depth);
    int prev_shift = 0;

    for (level = 0; level < depth; level++) {
      __kmp_x86_cpuid(11, level, &buf);
      unsigned apicId = buf.edx;
      if (buf.ebx == 0) {
        if (level != depth - 1) {
          KMP_CPU_FREE(oldMask);
          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
          return -1;
        }
        addr.labels[depth - level - 1] = apicId >> prev_shift;
        level++;
        break;
      }
      int shift = buf.eax & 0x1f;
      int mask = (1 << shift) - 1;
      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
      prev_shift = shift;
    }
    if (level != depth) {
      KMP_CPU_FREE(oldMask);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return -1;
    }

    retval[nApics] = AddrUnsPair(addr, proc);
    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  __kmp_set_system_affinity(oldMask, TRUE);

  // If there's only one thread context to bind to, return now.
  KMP_ASSERT(nApics > 0);
  if (nApics == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[pkgLevel];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

  // Find the radix at each of the levels.
  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  for (level = 0; level < depth; level++) {
    totals[level] = 1;
    maxCt[level] = 1;
    counts[level] = 1;
    last[level] = retval[0].first.labels[level];
  }

  // From here on, the iteration variable "level" runs from the coarsest level
  // to the finest, i.e. we iterate forward through
  // (*address2os)[].first.labels[] - in the previous loops, we iterated
  // backwards.
  for (proc = 1; (int)proc < nApics; proc++) {
    int level;
    for (level = 0; level < depth; level++) {
      if (retval[proc].first.labels[level] != last[level]) {
        int j;
        for (j = level + 1; j < depth; j++) {
          totals[j]++;
          counts[j] = 1;
          // The line below causes printing of incorrect topology information
          // in case the max value for some level (maxCt[level]) is encountered
          // earlier than some lesser value while going through the array. For
          // example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then
          // maxCt[1] == 2, whereas it must be 4.
          // TODO!!!
          // Check if it can be commented out safely
          // maxCt[j] = 1;
          last[j] = retval[proc].first.labels[j];
        }
        totals[level]++;
        counts[level]++;
        if (counts[level] > maxCt[level]) {
          maxCt[level] = counts[level];
        }
        last[level] = retval[proc].first.labels[level];
        break;
      } else if (level == depth - 1) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
        return -1;
      }
    }
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return if affinity is not
  // enabled.
  if (threadLevel >= 0) {
    __kmp_nThreadsPerCore = maxCt[threadLevel];
  } else {
    __kmp_nThreadsPerCore = 1;
  }
  nPackages = totals[pkgLevel];

  if (coreLevel >= 0) {
    __kmp_ncores = totals[coreLevel];
    nCoresPerPkg = maxCt[coreLevel];
  } else {
    __kmp_ncores = nPackages;
    nCoresPerPkg = 1;
  }

  // Check to see if the machine topology is uniform
  unsigned prod = maxCt[0];
  for (level = 1; level < depth; level++) {
    prod *= maxCt[level];
  }
  bool uniform = (prod == totals[level - 1]);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }

    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);

    __kmp_str_buf_print(&buf, "%d", totals[0]);
    for (level = 1; level <= pkgLevel; level++) {
      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
    }
    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);

    __kmp_str_buf_free(&buf);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  for (proc = 0; (int)proc < nApics; ++proc) {
    __kmp_pu_os_idx[proc] = retval[proc].second;
  }
  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  int new_depth = 0;
  for (level = 0; level < depth; level++) {
    if ((maxCt[level] == 1) && (level != pkgLevel)) {
      continue;
    }
    new_depth++;
  }

  // If we are removing any levels, allocate a new vector to return,
  // and copy the relevant information to it.
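  // Illustrative example: a depth-3 {package, core, thread} map in which every
  // core has exactly one hardware thread (maxCt at the thread level == 1) is
  // copied into a depth-2 {package, core} map, and threadLevel ends up -1.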
1598   if (new_depth != depth) {
1599     AddrUnsPair *new_retval =
1600         (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1601     for (proc = 0; (int)proc < nApics; proc++) {
1602       Address addr(new_depth);
1603       new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1604     }
1605     int new_level = 0;
1606     int newPkgLevel = -1;
1607     int newCoreLevel = -1;
1608     int newThreadLevel = -1;
1609     int i;
1610     for (level = 0; level < depth; level++) {
1611       if ((maxCt[level] == 1) && (level != pkgLevel)) {
1612         // Remove this level. Never remove the package level.
1613         continue;
1614       }
1615       if (level == pkgLevel) {
1616         newPkgLevel = level;
1617       }
1618       if (level == coreLevel) {
1619         newCoreLevel = level;
1620       }
1621       if (level == threadLevel) {
1622         newThreadLevel = level;
1623       }
1624       for (proc = 0; (int)proc < nApics; proc++) {
1625         new_retval[proc].first.labels[new_level] =
1626             retval[proc].first.labels[level];
1627       }
1628       new_level++;
1629     }
1630
1631     __kmp_free(retval);
1632     retval = new_retval;
1633     depth = new_depth;
1634     pkgLevel = newPkgLevel;
1635     coreLevel = newCoreLevel;
1636     threadLevel = newThreadLevel;
1637   }
1638
1639   if (__kmp_affinity_gran_levels < 0) {
1640     // Set the granularity level based on what levels are modeled
1641     // in the machine topology map.
1642     __kmp_affinity_gran_levels = 0;
1643     if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1644       __kmp_affinity_gran_levels++;
1645     }
1646     if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1647       __kmp_affinity_gran_levels++;
1648     }
1649     if (__kmp_affinity_gran > affinity_gran_package) {
1650       __kmp_affinity_gran_levels++;
1651     }
1652   }
1653
1654   if (__kmp_affinity_verbose) {
1655     __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
1656                                   threadLevel);
1657   }
1658
1659   __kmp_free(last);
1660   __kmp_free(maxCt);
1661   __kmp_free(counts);
1662   __kmp_free(totals);
1663   KMP_CPU_FREE(oldMask);
1664   *address2os = retval;
1665   return depth;
1666 }
1667
1668 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1669
1670 #define osIdIndex 0
1671 #define threadIdIndex 1
1672 #define coreIdIndex 2
1673 #define pkgIdIndex 3
1674 #define nodeIdIndex 4
1675
1676 typedef unsigned *ProcCpuInfo;
1677 static unsigned maxIndex = pkgIdIndex;
1678
1679 static int __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) {
1680   const unsigned *aa = (const unsigned *)a;
1681   const unsigned *bb = (const unsigned *)b;
1682   if (aa[osIdIndex] < bb[osIdIndex])
1683     return -1;
1684   if (aa[osIdIndex] > bb[osIdIndex])
1685     return 1;
1686   return 0;
1687 }
1688
1689 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
1690                                                   const void *b) {
1691   unsigned i;
1692   const unsigned *aa = *((const unsigned **)a);
1693   const unsigned *bb = *((const unsigned **)b);
1694   for (i = maxIndex;; i--) {
1695     if (aa[i] < bb[i])
1696       return -1;
1697     if (aa[i] > bb[i])
1698       return 1;
1699     if (i == osIdIndex)
1700       break;
1701   }
1702   return 0;
1703 }
1704
1705 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
1706 // affinity map.
1707 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
1708                                              int *line,
1709                                              kmp_i18n_id_t *const msg_id,
1710                                              FILE *f) {
1711   *address2os = NULL;
1712   *msg_id = kmp_i18n_null;
1713
1714   // Scan the file once, counting the number of "processor" (osId) fields,
1715   // and finding the highest value of <n> for a node_<n> field.
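  // For reference, a record in /proc/cpuinfo has the shape shown below (only
  // the fields recognized by this parser matter; unknown lines are skipped and
  // a blank line terminates the record). "thread id" and "node_<n> id" are
  // accepted in the same "<name> : <value>" format, e.g. from an alternate
  // topology file:
  //
  //   processor   : 0
  //   physical id : 0
  //   core id     : 0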
1716   char buf[256];
1717   unsigned num_records = 0;
1718   while (!feof(f)) {
1719     buf[sizeof(buf) - 1] = 1;
1720     if (!fgets(buf, sizeof(buf), f)) {
1721       // Read error, presumably because of EOF
1722       break;
1723     }
1724
1725     char s1[] = "processor";
1726     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
1727       num_records++;
1728       continue;
1729     }
1730
1731     // FIXME - this will match "node_<n> <garbage>"
1732     unsigned level;
1733     if (KMP_SSCANF(buf, "node_%d id", &level) == 1) {
1734       if (nodeIdIndex + level >= maxIndex) {
1735         maxIndex = nodeIdIndex + level;
1736       }
1737       continue;
1738     }
1739   }
1740
1741   // Check for empty file / no valid processor records, or too many. The number
1742   // of records can't exceed the number of valid bits in the affinity mask.
1743   if (num_records == 0) {
1744     *line = 0;
1745     *msg_id = kmp_i18n_str_NoProcRecords;
1746     return -1;
1747   }
1748   if (num_records > (unsigned)__kmp_xproc) {
1749     *line = 0;
1750     *msg_id = kmp_i18n_str_TooManyProcRecords;
1751     return -1;
1752   }
1753
1754   // Set the file pointer back to the beginning, so that we can scan the file
1755   // again, this time performing a full parse of the data. Allocate a vector of
1756   // ProcCpuInfo objects, where we will place the data. Adding an extra element
1757   // at the end allows us to remove a lot of extra checks for termination
1758   // conditions.
1759   if (fseek(f, 0, SEEK_SET) != 0) {
1760     *line = 0;
1761     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
1762     return -1;
1763   }
1764
1765   // Allocate the array of records to store the proc info in. The dummy
1766   // element at the end makes the logic in filling them out easier to code.
1767   unsigned **threadInfo =
1768       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
1769   unsigned i;
1770   for (i = 0; i <= num_records; i++) {
1771     threadInfo[i] =
1772         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
1773   }
1774
1775 #define CLEANUP_THREAD_INFO \
1776   for (i = 0; i <= num_records; i++) { \
1777     __kmp_free(threadInfo[i]); \
1778   } \
1779   __kmp_free(threadInfo);
1780
1781   // A value of UINT_MAX means that we didn't find the field
1782   unsigned __index;
1783
1784 #define INIT_PROC_INFO(p) \
1785   for (__index = 0; __index <= maxIndex; __index++) { \
1786     (p)[__index] = UINT_MAX; \
1787   }
1788
1789   for (i = 0; i <= num_records; i++) {
1790     INIT_PROC_INFO(threadInfo[i]);
1791   }
1792
1793   unsigned num_avail = 0;
1794   *line = 0;
1795   while (!feof(f)) {
1796     // Create an inner scoping level, so that all the goto targets at the end of
1797     // the loop appear in an outer scoping level. This avoids warnings about
1798     // jumping past an initialization to a target in the same block.
1799     {
1800       buf[sizeof(buf) - 1] = 1;
1801       bool long_line = false;
1802       if (!fgets(buf, sizeof(buf), f)) {
1803         // Read error, presumably because of EOF.
1804         // If there is valid data in threadInfo[num_avail], then fake
1805         // a blank line to ensure that the last address gets parsed.
1806         bool valid = false;
1807         for (i = 0; i <= maxIndex; i++) {
1808           if (threadInfo[num_avail][i] != UINT_MAX) {
1809             valid = true;
1810           }
1811         }
1812         if (!valid) {
1813           break;
1814         }
1815         buf[0] = 0;
1816       } else if (!buf[sizeof(buf) - 1]) {
1817         // The line is longer than the buffer. Set a flag and don't
1818         // emit an error if we were going to ignore the line, anyway.
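        // (Note: the sentinel byte buf[sizeof(buf) - 1] was pre-set to 1
        // before the fgets() call above. fgets() stores its terminating '\0'
        // in that byte only when the input line fills the whole buffer, so a
        // cleared sentinel means the line did not fit.)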
1819 long_line = true; 1820 1821 #define CHECK_LINE \ 1822 if (long_line) { \ 1823 CLEANUP_THREAD_INFO; \ 1824 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1825 return -1; \ 1826 } 1827 } 1828 (*line)++; 1829 1830 char s1[] = "processor"; 1831 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1832 CHECK_LINE; 1833 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1834 unsigned val; 1835 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1836 goto no_val; 1837 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 1838 goto dup_field; 1839 threadInfo[num_avail][osIdIndex] = val; 1840 #if KMP_OS_LINUX && USE_SYSFS_INFO 1841 char path[256]; 1842 KMP_SNPRINTF( 1843 path, sizeof(path), 1844 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1845 threadInfo[num_avail][osIdIndex]); 1846 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1847 1848 KMP_SNPRINTF(path, sizeof(path), 1849 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1850 threadInfo[num_avail][osIdIndex]); 1851 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1852 continue; 1853 #else 1854 } 1855 char s2[] = "physical id"; 1856 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1857 CHECK_LINE; 1858 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1859 unsigned val; 1860 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1861 goto no_val; 1862 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 1863 goto dup_field; 1864 threadInfo[num_avail][pkgIdIndex] = val; 1865 continue; 1866 } 1867 char s3[] = "core id"; 1868 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1869 CHECK_LINE; 1870 char *p = strchr(buf + sizeof(s3) - 1, ':'); 1871 unsigned val; 1872 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1873 goto no_val; 1874 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 1875 goto dup_field; 1876 threadInfo[num_avail][coreIdIndex] = val; 1877 continue; 1878 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 1879 } 1880 char s4[] = "thread id"; 1881 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 1882 CHECK_LINE; 1883 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1884 unsigned val; 1885 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1886 goto no_val; 1887 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 1888 goto dup_field; 1889 threadInfo[num_avail][threadIdIndex] = val; 1890 continue; 1891 } 1892 unsigned level; 1893 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1894 CHECK_LINE; 1895 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1896 unsigned val; 1897 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 1898 goto no_val; 1899 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 1900 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 1901 goto dup_field; 1902 threadInfo[num_avail][nodeIdIndex + level] = val; 1903 continue; 1904 } 1905 1906 // We didn't recognize the leading token on the line. There are lots of 1907 // leading tokens that we don't recognize - if the line isn't empty, go on 1908 // to the next line. 1909 if ((*buf != 0) && (*buf != '\n')) { 1910 // If the line is longer than the buffer, read characters 1911 // until we find a newline. 1912 if (long_line) { 1913 int ch; 1914 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 1915 ; 1916 } 1917 continue; 1918 } 1919 1920 // A newline has signalled the end of the processor record. 1921 // Check that there aren't too many procs specified. 1922 if ((int)num_avail == __kmp_xproc) { 1923 CLEANUP_THREAD_INFO; 1924 *msg_id = kmp_i18n_str_TooManyEntries; 1925 return -1; 1926 } 1927 1928 // Check for missing fields. 
The osId field must be there, and we 1929 // currently require that the physical id field is specified, also. 1930 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 1931 CLEANUP_THREAD_INFO; 1932 *msg_id = kmp_i18n_str_MissingProcField; 1933 return -1; 1934 } 1935 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 1936 CLEANUP_THREAD_INFO; 1937 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 1938 return -1; 1939 } 1940 1941 // Skip this proc if it is not included in the machine model. 1942 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 1943 __kmp_affin_fullMask)) { 1944 INIT_PROC_INFO(threadInfo[num_avail]); 1945 continue; 1946 } 1947 1948 // We have a successful parse of this proc's info. 1949 // Increment the counter, and prepare for the next proc. 1950 num_avail++; 1951 KMP_ASSERT(num_avail <= num_records); 1952 INIT_PROC_INFO(threadInfo[num_avail]); 1953 } 1954 continue; 1955 1956 no_val: 1957 CLEANUP_THREAD_INFO; 1958 *msg_id = kmp_i18n_str_MissingValCpuinfo; 1959 return -1; 1960 1961 dup_field: 1962 CLEANUP_THREAD_INFO; 1963 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 1964 return -1; 1965 } 1966 *line = 0; 1967 1968 #if KMP_MIC && REDUCE_TEAM_SIZE 1969 unsigned teamSize = 0; 1970 #endif // KMP_MIC && REDUCE_TEAM_SIZE 1971 1972 // check for num_records == __kmp_xproc ??? 1973 1974 // If there's only one thread context to bind to, form an Address object with 1975 // depth 1 and return immediately (or, if affinity is off, set address2os to 1976 // NULL and return). 1977 // 1978 // If it is configured to omit the package level when there is only a single 1979 // package, the logic at the end of this routine won't work if there is only a 1980 // single thread - it would try to form an Address object with depth 0. 1981 KMP_ASSERT(num_avail > 0); 1982 KMP_ASSERT(num_avail <= num_records); 1983 if (num_avail == 1) { 1984 __kmp_ncores = 1; 1985 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1986 if (__kmp_affinity_verbose) { 1987 if (!KMP_AFFINITY_CAPABLE()) { 1988 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 1989 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1990 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1991 } else { 1992 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1993 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 1994 __kmp_affin_fullMask); 1995 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 1996 if (__kmp_affinity_respect_mask) { 1997 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1998 } else { 1999 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2000 } 2001 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2002 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2003 } 2004 int index; 2005 kmp_str_buf_t buf; 2006 __kmp_str_buf_init(&buf); 2007 __kmp_str_buf_print(&buf, "1"); 2008 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2009 __kmp_str_buf_print(&buf, " x 1"); 2010 } 2011 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2012 __kmp_str_buf_free(&buf); 2013 } 2014 2015 if (__kmp_affinity_type == affinity_none) { 2016 CLEANUP_THREAD_INFO; 2017 return 0; 2018 } 2019 2020 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair)); 2021 Address addr(1); 2022 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2023 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2024 2025 if (__kmp_affinity_gran_levels < 0) { 2026 __kmp_affinity_gran_levels = 0; 2027 } 2028 2029 if (__kmp_affinity_verbose) { 2030 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2031 } 2032 2033 CLEANUP_THREAD_INFO; 2034 
    return 1;
2035   }
2036
2037   // Sort the threadInfo table by physical Id.
2038   qsort(threadInfo, num_avail, sizeof(*threadInfo),
2039         __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2040
2041   // The table is now sorted by pkgId / coreId / threadId, but we really don't
2042   // know the radix of any of the fields. pkgId's may be sparsely assigned among
2043   // the chips on a system. Although coreId's are usually assigned
2044   // [0 .. coresPerPkg-1] and threadId's are usually assigned
2045   // [0..threadsPerCore-1], we don't want to make any such assumptions.
2046   //
2047   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2048   // total # packages) are at this point - we want to determine that now. We
2049   // only have an upper bound on the first two figures.
2050   unsigned *counts =
2051       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2052   unsigned *maxCt =
2053       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2054   unsigned *totals =
2055       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2056   unsigned *lastId =
2057       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2058
2059   bool assign_thread_ids = false;
2060   unsigned threadIdCt;
2061   unsigned index;
2062
2063 restart_radix_check:
2064   threadIdCt = 0;
2065
2066   // Initialize the counter arrays with data from threadInfo[0].
2067   if (assign_thread_ids) {
2068     if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2069       threadInfo[0][threadIdIndex] = threadIdCt++;
2070     } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2071       threadIdCt = threadInfo[0][threadIdIndex] + 1;
2072     }
2073   }
2074   for (index = 0; index <= maxIndex; index++) {
2075     counts[index] = 1;
2076     maxCt[index] = 1;
2077     totals[index] = 1;
2078     lastId[index] = threadInfo[0][index];
2079
2080   }
2081
2082   // Run through the rest of the OS procs.
2083   for (i = 1; i < num_avail; i++) {
2084     // Find the most significant index whose id differs from the id for the
2085     // previous OS proc.
2086     for (index = maxIndex; index >= threadIdIndex; index--) {
2087       if (assign_thread_ids && (index == threadIdIndex)) {
2088         // Auto-assign the thread id field if it wasn't specified.
2089         if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2090           threadInfo[i][threadIdIndex] = threadIdCt++;
2091         }
2092         // Apparently the thread id field was specified for some entries and
2093         // not others. Start the thread id counter off at the next higher
2094         // thread id.
2095         else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2096           threadIdCt = threadInfo[i][threadIdIndex] + 1;
2097         }
2098       }
2099       if (threadInfo[i][index] != lastId[index]) {
2100         // Run through all indices which are less significant, and reset the
2101         // counts to 1. At all levels up to and including index, we need to
2102         // increment the totals and record the last id.
2103         unsigned index2;
2104         for (index2 = threadIdIndex; index2 < index; index2++) {
2105           totals[index2]++;
2106           if (counts[index2] > maxCt[index2]) {
2107             maxCt[index2] = counts[index2];
2108           }
2109           counts[index2] = 1;
2110           lastId[index2] = threadInfo[i][index2];
2111         }
2112         counts[index]++;
2113         totals[index]++;
2114         lastId[index] = threadInfo[i][index];
2115
2116         if (assign_thread_ids && (index > threadIdIndex)) {
2117
2118 #if KMP_MIC && REDUCE_TEAM_SIZE
2119           // The default team size is the total #threads in the machine
2120           // minus 1 thread for every core that has 3 or more threads.
           teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2121 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2122
2123           // Restart the thread counter, as we are on a new core.
2124           threadIdCt = 0;
2125
2126           // Auto-assign the thread id field if it wasn't specified.
2127           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2128             threadInfo[i][threadIdIndex] = threadIdCt++;
2129           }
2130
2131           // Apparently the thread id field was specified for some entries and
2132           // not others. Start the thread id counter off at the next higher
2133           // thread id.
2134           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2135             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2136           }
2137         }
2138         break;
2139       }
2140     }
2141     if (index < threadIdIndex) {
2142       // If thread ids were specified, it is an error if they are not unique.
2143       // Also, check that we haven't already restarted the loop (to be safe -
2144       // we shouldn't need to).
2145       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2146         __kmp_free(lastId);
2147         __kmp_free(totals);
2148         __kmp_free(maxCt);
2149         __kmp_free(counts);
2150         CLEANUP_THREAD_INFO;
2151         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2152         return -1;
2153       }
2154
2155       // If the thread ids were not specified and we see entries that
2156       // are duplicates, start the loop over and assign the thread ids manually.
2157       assign_thread_ids = true;
2158       goto restart_radix_check;
2159     }
2160   }
2161
2162 #if KMP_MIC && REDUCE_TEAM_SIZE
2163   // The default team size is the total #threads in the machine
2164   // minus 1 thread for every core that has 3 or more threads.
2165   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2166 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2167
2168   for (index = threadIdIndex; index <= maxIndex; index++) {
2169     if (counts[index] > maxCt[index]) {
2170       maxCt[index] = counts[index];
2171     }
2172   }
2173
2174   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2175   nCoresPerPkg = maxCt[coreIdIndex];
2176   nPackages = totals[pkgIdIndex];
2177
2178   // Check to see if the machine topology is uniform
2179   unsigned prod = totals[maxIndex];
2180   for (index = threadIdIndex; index < maxIndex; index++) {
2181     prod *= maxCt[index];
2182   }
2183   bool uniform = (prod == totals[threadIdIndex]);
2184
2185   // When affinity is off, this routine will still be called to set
2186   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2187   // Make sure all these vars are set correctly, and return now if affinity is
2188   // not enabled.
2189 __kmp_ncores = totals[coreIdIndex]; 2190 2191 if (__kmp_affinity_verbose) { 2192 if (!KMP_AFFINITY_CAPABLE()) { 2193 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2194 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2195 if (uniform) { 2196 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2197 } else { 2198 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2199 } 2200 } else { 2201 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2202 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2203 __kmp_affin_fullMask); 2204 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2205 if (__kmp_affinity_respect_mask) { 2206 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2207 } else { 2208 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2209 } 2210 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2211 if (uniform) { 2212 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2213 } else { 2214 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2215 } 2216 } 2217 kmp_str_buf_t buf; 2218 __kmp_str_buf_init(&buf); 2219 2220 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2221 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2222 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2223 } 2224 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2225 maxCt[threadIdIndex], __kmp_ncores); 2226 2227 __kmp_str_buf_free(&buf); 2228 } 2229 2230 #if KMP_MIC && REDUCE_TEAM_SIZE 2231 // Set the default team size. 2232 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2233 __kmp_dflt_team_nth = teamSize; 2234 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2235 "__kmp_dflt_team_nth = %d\n", 2236 __kmp_dflt_team_nth)); 2237 } 2238 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2239 2240 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2241 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2242 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2243 for (i = 0; i < num_avail; ++i) { // fill the os indices 2244 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2245 } 2246 2247 if (__kmp_affinity_type == affinity_none) { 2248 __kmp_free(lastId); 2249 __kmp_free(totals); 2250 __kmp_free(maxCt); 2251 __kmp_free(counts); 2252 CLEANUP_THREAD_INFO; 2253 return 0; 2254 } 2255 2256 // Count the number of levels which have more nodes at that level than at the 2257 // parent's level (with there being an implicit root node of the top level). 2258 // This is equivalent to saying that there is at least one node at this level 2259 // which has a sibling. These levels are in the map, and the package level is 2260 // always in the map. 2261 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2262 int level = 0; 2263 for (index = threadIdIndex; index < maxIndex; index++) { 2264 KMP_ASSERT(totals[index] >= totals[index + 1]); 2265 inMap[index] = (totals[index] > totals[index + 1]); 2266 } 2267 inMap[maxIndex] = (totals[maxIndex] > 1); 2268 inMap[pkgIdIndex] = true; 2269 2270 int depth = 0; 2271 for (index = threadIdIndex; index <= maxIndex; index++) { 2272 if (inMap[index]) { 2273 depth++; 2274 } 2275 } 2276 KMP_ASSERT(depth > 0); 2277 2278 // Construct the data structure that is to be returned. 
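  // (For illustration, with hypothetical ids: if inMap keeps the package,
  // core, and thread levels, a proc with pkgId 1, coreId 3, threadId 0, and
  // osId 14 is stored as Address labels {1, 3, 0} - most significant level
  // first - paired with OS proc id 14.)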
2279 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2280 int pkgLevel = -1; 2281 int coreLevel = -1; 2282 int threadLevel = -1; 2283 2284 for (i = 0; i < num_avail; ++i) { 2285 Address addr(depth); 2286 unsigned os = threadInfo[i][osIdIndex]; 2287 int src_index; 2288 int dst_index = 0; 2289 2290 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2291 if (!inMap[src_index]) { 2292 continue; 2293 } 2294 addr.labels[dst_index] = threadInfo[i][src_index]; 2295 if (src_index == pkgIdIndex) { 2296 pkgLevel = dst_index; 2297 } else if (src_index == coreIdIndex) { 2298 coreLevel = dst_index; 2299 } else if (src_index == threadIdIndex) { 2300 threadLevel = dst_index; 2301 } 2302 dst_index++; 2303 } 2304 (*address2os)[i] = AddrUnsPair(addr, os); 2305 } 2306 2307 if (__kmp_affinity_gran_levels < 0) { 2308 // Set the granularity level based on what levels are modeled 2309 // in the machine topology map. 2310 unsigned src_index; 2311 __kmp_affinity_gran_levels = 0; 2312 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2313 if (!inMap[src_index]) { 2314 continue; 2315 } 2316 switch (src_index) { 2317 case threadIdIndex: 2318 if (__kmp_affinity_gran > affinity_gran_thread) { 2319 __kmp_affinity_gran_levels++; 2320 } 2321 2322 break; 2323 case coreIdIndex: 2324 if (__kmp_affinity_gran > affinity_gran_core) { 2325 __kmp_affinity_gran_levels++; 2326 } 2327 break; 2328 2329 case pkgIdIndex: 2330 if (__kmp_affinity_gran > affinity_gran_package) { 2331 __kmp_affinity_gran_levels++; 2332 } 2333 break; 2334 } 2335 } 2336 } 2337 2338 if (__kmp_affinity_verbose) { 2339 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2340 coreLevel, threadLevel); 2341 } 2342 2343 __kmp_free(inMap); 2344 __kmp_free(lastId); 2345 __kmp_free(totals); 2346 __kmp_free(maxCt); 2347 __kmp_free(counts); 2348 CLEANUP_THREAD_INFO; 2349 return depth; 2350 } 2351 2352 // Create and return a table of affinity masks, indexed by OS thread ID. 2353 // This routine handles OR'ing together all the affinity masks of threads 2354 // that are sufficiently close, if granularity > fine. 2355 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2356 unsigned *numUnique, 2357 AddrUnsPair *address2os, 2358 unsigned numAddrs) { 2359 // First form a table of affinity masks in order of OS thread id. 2360 unsigned depth; 2361 unsigned maxOsId; 2362 unsigned i; 2363 2364 KMP_ASSERT(numAddrs > 0); 2365 depth = address2os[0].first.depth; 2366 2367 maxOsId = 0; 2368 for (i = 0; i < numAddrs; i++) { 2369 unsigned osId = address2os[i].second; 2370 if (osId > maxOsId) { 2371 maxOsId = osId; 2372 } 2373 } 2374 kmp_affin_mask_t *osId2Mask; 2375 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2376 2377 // Sort the address2os table according to physical order. Doing so will put 2378 // all threads on the same core/package/node in consecutive locations. 2379 qsort(address2os, numAddrs, sizeof(*address2os), 2380 __kmp_affinity_cmp_Address_labels); 2381 2382 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2383 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2384 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2385 } 2386 if (__kmp_affinity_gran_levels >= (int)depth) { 2387 if (__kmp_affinity_verbose || 2388 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2389 KMP_WARNING(AffThreadsMayMigrate); 2390 } 2391 } 2392 2393 // Run through the table, forming the masks for all threads on each core. 
2394 // Threads on the same core will have identical "Address" objects, not 2395 // considering the last level, which must be the thread id. All threads on a 2396 // core will appear consecutively. 2397 unsigned unique = 0; 2398 unsigned j = 0; // index of 1st thread on core 2399 unsigned leader = 0; 2400 Address *leaderAddr = &(address2os[0].first); 2401 kmp_affin_mask_t *sum; 2402 KMP_CPU_ALLOC_ON_STACK(sum); 2403 KMP_CPU_ZERO(sum); 2404 KMP_CPU_SET(address2os[0].second, sum); 2405 for (i = 1; i < numAddrs; i++) { 2406 // If this thread is sufficiently close to the leader (within the 2407 // granularity setting), then set the bit for this os thread in the 2408 // affinity mask for this group, and go on to the next thread. 2409 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) { 2410 KMP_CPU_SET(address2os[i].second, sum); 2411 continue; 2412 } 2413 2414 // For every thread in this group, copy the mask to the thread's entry in 2415 // the osId2Mask table. Mark the first address as a leader. 2416 for (; j < i; j++) { 2417 unsigned osId = address2os[j].second; 2418 KMP_DEBUG_ASSERT(osId <= maxOsId); 2419 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2420 KMP_CPU_COPY(mask, sum); 2421 address2os[j].first.leader = (j == leader); 2422 } 2423 unique++; 2424 2425 // Start a new mask. 2426 leader = i; 2427 leaderAddr = &(address2os[i].first); 2428 KMP_CPU_ZERO(sum); 2429 KMP_CPU_SET(address2os[i].second, sum); 2430 } 2431 2432 // For every thread in last group, copy the mask to the thread's 2433 // entry in the osId2Mask table. 2434 for (; j < i; j++) { 2435 unsigned osId = address2os[j].second; 2436 KMP_DEBUG_ASSERT(osId <= maxOsId); 2437 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2438 KMP_CPU_COPY(mask, sum); 2439 address2os[j].first.leader = (j == leader); 2440 } 2441 unique++; 2442 KMP_CPU_FREE_FROM_STACK(sum); 2443 2444 *maxIndex = maxOsId; 2445 *numUnique = unique; 2446 return osId2Mask; 2447 } 2448 2449 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2450 // as file-static than to try and pass them through the calling sequence of 2451 // the recursive-descent OMP_PLACES parser. 2452 static kmp_affin_mask_t *newMasks; 2453 static int numNewMasks; 2454 static int nextNewMask; 2455 2456 #define ADD_MASK(_mask) \ 2457 { \ 2458 if (nextNewMask >= numNewMasks) { \ 2459 int i; \ 2460 numNewMasks *= 2; \ 2461 kmp_affin_mask_t *temp; \ 2462 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2463 for (i = 0; i < numNewMasks / 2; i++) { \ 2464 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2465 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2466 KMP_CPU_COPY(dest, src); \ 2467 } \ 2468 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2469 newMasks = temp; \ 2470 } \ 2471 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2472 nextNewMask++; \ 2473 } 2474 2475 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2476 { \ 2477 if (((_osId) > _maxOsId) || \ 2478 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2479 if (__kmp_affinity_verbose || \ 2480 (__kmp_affinity_warnings && \ 2481 (__kmp_affinity_type != affinity_none))) { \ 2482 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2483 } \ 2484 } else { \ 2485 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2486 } \ 2487 } 2488 2489 // Re-parse the proclist (for the explicit affinity type), and form the list 2490 // of affinity newMasks indexed by gtid. 
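// (Hypothetical example: given proclist=[0,2,{4,5},8-10], the parser below
// emits one mask per scalar and per range element, and a single union mask
// per {...} set, i.e. the masks {0}, {2}, {4,5}, {8}, {9}, {10}.)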
2491 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2492 unsigned int *out_numMasks, 2493 const char *proclist, 2494 kmp_affin_mask_t *osId2Mask, 2495 int maxOsId) { 2496 int i; 2497 const char *scan = proclist; 2498 const char *next = proclist; 2499 2500 // We use malloc() for the temporary mask vector, so that we can use 2501 // realloc() to extend it. 2502 numNewMasks = 2; 2503 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2504 nextNewMask = 0; 2505 kmp_affin_mask_t *sumMask; 2506 KMP_CPU_ALLOC(sumMask); 2507 int setSize = 0; 2508 2509 for (;;) { 2510 int start, end, stride; 2511 2512 SKIP_WS(scan); 2513 next = scan; 2514 if (*next == '\0') { 2515 break; 2516 } 2517 2518 if (*next == '{') { 2519 int num; 2520 setSize = 0; 2521 next++; // skip '{' 2522 SKIP_WS(next); 2523 scan = next; 2524 2525 // Read the first integer in the set. 2526 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2527 SKIP_DIGITS(next); 2528 num = __kmp_str_to_int(scan, *next); 2529 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2530 2531 // Copy the mask for that osId to the sum (union) mask. 2532 if ((num > maxOsId) || 2533 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2534 if (__kmp_affinity_verbose || 2535 (__kmp_affinity_warnings && 2536 (__kmp_affinity_type != affinity_none))) { 2537 KMP_WARNING(AffIgnoreInvalidProcID, num); 2538 } 2539 KMP_CPU_ZERO(sumMask); 2540 } else { 2541 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2542 setSize = 1; 2543 } 2544 2545 for (;;) { 2546 // Check for end of set. 2547 SKIP_WS(next); 2548 if (*next == '}') { 2549 next++; // skip '}' 2550 break; 2551 } 2552 2553 // Skip optional comma. 2554 if (*next == ',') { 2555 next++; 2556 } 2557 SKIP_WS(next); 2558 2559 // Read the next integer in the set. 2560 scan = next; 2561 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2562 2563 SKIP_DIGITS(next); 2564 num = __kmp_str_to_int(scan, *next); 2565 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2566 2567 // Add the mask for that osId to the sum mask. 2568 if ((num > maxOsId) || 2569 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2570 if (__kmp_affinity_verbose || 2571 (__kmp_affinity_warnings && 2572 (__kmp_affinity_type != affinity_none))) { 2573 KMP_WARNING(AffIgnoreInvalidProcID, num); 2574 } 2575 } else { 2576 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2577 setSize++; 2578 } 2579 } 2580 if (setSize > 0) { 2581 ADD_MASK(sumMask); 2582 } 2583 2584 SKIP_WS(next); 2585 if (*next == ',') { 2586 next++; 2587 } 2588 scan = next; 2589 continue; 2590 } 2591 2592 // Read the first integer. 2593 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2594 SKIP_DIGITS(next); 2595 start = __kmp_str_to_int(scan, *next); 2596 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2597 SKIP_WS(next); 2598 2599 // If this isn't a range, then add a mask to the list and go on. 2600 if (*next != '-') { 2601 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2602 2603 // Skip optional comma. 2604 if (*next == ',') { 2605 next++; 2606 } 2607 scan = next; 2608 continue; 2609 } 2610 2611 // This is a range. Skip over the '-' and read in the 2nd int. 
2612     next++; // skip '-'
2613     SKIP_WS(next);
2614     scan = next;
2615     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2616     SKIP_DIGITS(next);
2617     end = __kmp_str_to_int(scan, *next);
2618     KMP_ASSERT2(end >= 0, "bad explicit proc list");
2619
2620     // Check for a stride parameter
2621     stride = 1;
2622     SKIP_WS(next);
2623     if (*next == ':') {
2624       // A stride is specified. Skip over the ':' and read the 3rd int.
2625       int sign = +1;
2626       next++; // skip ':'
2627       SKIP_WS(next);
2628       scan = next;
2629       if (*next == '-') {
2630         sign = -1;
2631         next++;
2632         SKIP_WS(next);
2633         scan = next;
2634       }
2635       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2636       SKIP_DIGITS(next);
2637       stride = __kmp_str_to_int(scan, *next);
2638       KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2639       stride *= sign;
2640     }
2641
2642     // Do some range checks.
2643     KMP_ASSERT2(stride != 0, "bad explicit proc list");
2644     if (stride > 0) {
2645       KMP_ASSERT2(start <= end, "bad explicit proc list");
2646     } else {
2647       KMP_ASSERT2(start >= end, "bad explicit proc list");
2648     }
2649     KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2650
2651     // Add the mask for each OS proc # to the list.
2652     if (stride > 0) {
2653       do {
2654         ADD_MASK_OSID(start, osId2Mask, maxOsId);
2655         start += stride;
2656       } while (start <= end);
2657     } else {
2658       do {
2659         ADD_MASK_OSID(start, osId2Mask, maxOsId);
2660         start += stride;
2661       } while (start >= end);
2662     }
2663
2664     // Skip optional comma.
2665     SKIP_WS(next);
2666     if (*next == ',') {
2667       next++;
2668     }
2669     scan = next;
2670   }
2671
2672   *out_numMasks = nextNewMask;
2673   if (nextNewMask == 0) {
2674     *out_masks = NULL;
2675     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2676     return;
2677   }
2678   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
2679   for (i = 0; i < nextNewMask; i++) {
2680     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
2681     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
2682     KMP_CPU_COPY(dest, src);
2683   }
2684   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
2685   KMP_CPU_FREE(sumMask);
2686 }
2687
2688 #if OMP_40_ENABLED
2689
2690 /*-----------------------------------------------------------------------------
2691   Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
2692   places. Again, here is the grammar:
2693
2694   place_list := place
2695   place_list := place , place_list
2696   place := num
2697   place := place : num
2698   place := place : num : signed
2699   place := { subplace_list }
2700   place := !
place // (lowest priority) 2701 subplace_list := subplace 2702 subplace_list := subplace , subplace_list 2703 subplace := num 2704 subplace := num : num 2705 subplace := num : num : signed 2706 signed := num 2707 signed := + signed 2708 signed := - signed 2709 -----------------------------------------------------------------------------*/ 2710 2711 static void __kmp_process_subplace_list(const char **scan, 2712 kmp_affin_mask_t *osId2Mask, 2713 int maxOsId, kmp_affin_mask_t *tempMask, 2714 int *setSize) { 2715 const char *next; 2716 2717 for (;;) { 2718 int start, count, stride, i; 2719 2720 // Read in the starting proc id 2721 SKIP_WS(*scan); 2722 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2723 next = *scan; 2724 SKIP_DIGITS(next); 2725 start = __kmp_str_to_int(*scan, *next); 2726 KMP_ASSERT(start >= 0); 2727 *scan = next; 2728 2729 // valid follow sets are ',' ':' and '}' 2730 SKIP_WS(*scan); 2731 if (**scan == '}' || **scan == ',') { 2732 if ((start > maxOsId) || 2733 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2734 if (__kmp_affinity_verbose || 2735 (__kmp_affinity_warnings && 2736 (__kmp_affinity_type != affinity_none))) { 2737 KMP_WARNING(AffIgnoreInvalidProcID, start); 2738 } 2739 } else { 2740 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2741 (*setSize)++; 2742 } 2743 if (**scan == '}') { 2744 break; 2745 } 2746 (*scan)++; // skip ',' 2747 continue; 2748 } 2749 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2750 (*scan)++; // skip ':' 2751 2752 // Read count parameter 2753 SKIP_WS(*scan); 2754 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2755 next = *scan; 2756 SKIP_DIGITS(next); 2757 count = __kmp_str_to_int(*scan, *next); 2758 KMP_ASSERT(count >= 0); 2759 *scan = next; 2760 2761 // valid follow sets are ',' ':' and '}' 2762 SKIP_WS(*scan); 2763 if (**scan == '}' || **scan == ',') { 2764 for (i = 0; i < count; i++) { 2765 if ((start > maxOsId) || 2766 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2767 if (__kmp_affinity_verbose || 2768 (__kmp_affinity_warnings && 2769 (__kmp_affinity_type != affinity_none))) { 2770 KMP_WARNING(AffIgnoreInvalidProcID, start); 2771 } 2772 break; // don't proliferate warnings for large count 2773 } else { 2774 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2775 start++; 2776 (*setSize)++; 2777 } 2778 } 2779 if (**scan == '}') { 2780 break; 2781 } 2782 (*scan)++; // skip ',' 2783 continue; 2784 } 2785 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2786 (*scan)++; // skip ':' 2787 2788 // Read stride parameter 2789 int sign = +1; 2790 for (;;) { 2791 SKIP_WS(*scan); 2792 if (**scan == '+') { 2793 (*scan)++; // skip '+' 2794 continue; 2795 } 2796 if (**scan == '-') { 2797 sign *= -1; 2798 (*scan)++; // skip '-' 2799 continue; 2800 } 2801 break; 2802 } 2803 SKIP_WS(*scan); 2804 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 2805 next = *scan; 2806 SKIP_DIGITS(next); 2807 stride = __kmp_str_to_int(*scan, *next); 2808 KMP_ASSERT(stride >= 0); 2809 *scan = next; 2810 stride *= sign; 2811 2812 // valid follow sets are ',' and '}' 2813 SKIP_WS(*scan); 2814 if (**scan == '}' || **scan == ',') { 2815 for (i = 0; i < count; i++) { 2816 if ((start > maxOsId) || 2817 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2818 if (__kmp_affinity_verbose || 2819 (__kmp_affinity_warnings && 2820 (__kmp_affinity_type != affinity_none))) { 2821 KMP_WARNING(AffIgnoreInvalidProcID, start); 2822 } 2823 
break; // don't proliferate warnings for large count 2824 } else { 2825 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2826 start += stride; 2827 (*setSize)++; 2828 } 2829 } 2830 if (**scan == '}') { 2831 break; 2832 } 2833 (*scan)++; // skip ',' 2834 continue; 2835 } 2836 2837 KMP_ASSERT2(0, "bad explicit places list"); 2838 } 2839 } 2840 2841 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 2842 int maxOsId, kmp_affin_mask_t *tempMask, 2843 int *setSize) { 2844 const char *next; 2845 2846 // valid follow sets are '{' '!' and num 2847 SKIP_WS(*scan); 2848 if (**scan == '{') { 2849 (*scan)++; // skip '{' 2850 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 2851 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 2852 (*scan)++; // skip '}' 2853 } else if (**scan == '!') { 2854 (*scan)++; // skip '!' 2855 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 2856 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 2857 } else if ((**scan >= '0') && (**scan <= '9')) { 2858 next = *scan; 2859 SKIP_DIGITS(next); 2860 int num = __kmp_str_to_int(*scan, *next); 2861 KMP_ASSERT(num >= 0); 2862 if ((num > maxOsId) || 2863 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2864 if (__kmp_affinity_verbose || 2865 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2866 KMP_WARNING(AffIgnoreInvalidProcID, num); 2867 } 2868 } else { 2869 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 2870 (*setSize)++; 2871 } 2872 *scan = next; // skip num 2873 } else { 2874 KMP_ASSERT2(0, "bad explicit places list"); 2875 } 2876 } 2877 2878 // static void 2879 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 2880 unsigned int *out_numMasks, 2881 const char *placelist, 2882 kmp_affin_mask_t *osId2Mask, 2883 int maxOsId) { 2884 int i, j, count, stride, sign; 2885 const char *scan = placelist; 2886 const char *next = placelist; 2887 2888 numNewMasks = 2; 2889 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2890 nextNewMask = 0; 2891 2892 // tempMask is modified based on the previous or initial 2893 // place to form the current place 2894 // previousMask contains the previous place 2895 kmp_affin_mask_t *tempMask; 2896 kmp_affin_mask_t *previousMask; 2897 KMP_CPU_ALLOC(tempMask); 2898 KMP_CPU_ZERO(tempMask); 2899 KMP_CPU_ALLOC(previousMask); 2900 KMP_CPU_ZERO(previousMask); 2901 int setSize = 0; 2902 2903 for (;;) { 2904 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 2905 2906 // valid follow sets are ',' ':' and EOL 2907 SKIP_WS(scan); 2908 if (*scan == '\0' || *scan == ',') { 2909 if (setSize > 0) { 2910 ADD_MASK(tempMask); 2911 } 2912 KMP_CPU_ZERO(tempMask); 2913 setSize = 0; 2914 if (*scan == '\0') { 2915 break; 2916 } 2917 scan++; // skip ',' 2918 continue; 2919 } 2920 2921 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2922 scan++; // skip ':' 2923 2924 // Read count parameter 2925 SKIP_WS(scan); 2926 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 2927 next = scan; 2928 SKIP_DIGITS(next); 2929 count = __kmp_str_to_int(scan, *next); 2930 KMP_ASSERT(count >= 0); 2931 scan = next; 2932 2933 // valid follow sets are ',' ':' and EOL 2934 SKIP_WS(scan); 2935 if (*scan == '\0' || *scan == ',') { 2936 stride = +1; 2937 } else { 2938 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 2939 scan++; // skip ':' 2940 2941 // Read stride parameter 2942 sign = +1; 2943 for (;;) { 2944 SKIP_WS(scan); 2945 if (*scan == '+') { 2946 scan++; // skip 
'+'
2947         continue;
2948       }
2949       if (*scan == '-') {
2950         sign *= -1;
2951         scan++; // skip '-'
2952         continue;
2953       }
2954       break;
2955     }
2956     SKIP_WS(scan);
2957     KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
2958     next = scan;
2959     SKIP_DIGITS(next);
2960     stride = __kmp_str_to_int(scan, *next);
2961     KMP_DEBUG_ASSERT(stride >= 0);
2962     scan = next;
2963     stride *= sign;
2964   }
2965
2966   // Add places determined by initial_place : count : stride
2967   for (i = 0; i < count; i++) {
2968     if (setSize == 0) {
2969       break;
2970     }
2971     // Add the current place, then build the next place (tempMask) from that
2972     KMP_CPU_COPY(previousMask, tempMask);
2973     ADD_MASK(previousMask);
2974     KMP_CPU_ZERO(tempMask);
2975     setSize = 0;
2976     KMP_CPU_SET_ITERATE(j, previousMask) {
2977       if (!KMP_CPU_ISSET(j, previousMask)) {
2978         continue;
2979       }
2980       if ((j + stride > maxOsId) || (j + stride < 0) ||
2981           (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
2982           (!KMP_CPU_ISSET(j + stride,
2983                           KMP_CPU_INDEX(osId2Mask, j + stride)))) {
2984         if ((__kmp_affinity_verbose ||
2985              (__kmp_affinity_warnings &&
2986               (__kmp_affinity_type != affinity_none))) &&
2987             i < count - 1) {
2988           KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
2989         }
2990         continue;
2991       }
2992       KMP_CPU_SET(j + stride, tempMask);
2993       setSize++;
2994     }
2995   }
2996   KMP_CPU_ZERO(tempMask);
2997   setSize = 0;
2998
2999   // valid follow sets are ',' and EOL
3000   SKIP_WS(scan);
3001   if (*scan == '\0') {
3002     break;
3003   }
3004   if (*scan == ',') {
3005     scan++; // skip ','
3006     continue;
3007   }
3008
3009   KMP_ASSERT2(0, "bad explicit places list");
3010 }
3011
3012   *out_numMasks = nextNewMask;
3013   if (nextNewMask == 0) {
3014     *out_masks = NULL;
3015     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3016     return;
3017   }
3018   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3019   KMP_CPU_FREE(tempMask);
3020   KMP_CPU_FREE(previousMask);
3021   for (i = 0; i < nextNewMask; i++) {
3022     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3023     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3024     KMP_CPU_COPY(dest, src);
3025   }
3026   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3027 }
3028
3029 #endif /* OMP_40_ENABLED */
3030
3031 #undef ADD_MASK
3032 #undef ADD_MASK_OSID
3033
3034 #if KMP_USE_HWLOC
3035 static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
3036                                               hwloc_obj_type_t type,
3037                                               hwloc_obj_t *f) {
3038   if (!hwloc_compare_types(o->type, type)) {
3039     if (*f == NULL)
3040       *f = o; // output first descendant found
3041     return 1;
3042   }
3043   int sum = 0;
3044   for (unsigned i = 0; i < o->arity; i++)
3045     sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
3046   return sum; // will be 0 if none found (as PU arity is 0)
3047 }
3048
3049 static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
3050                                                hwloc_obj_t o, unsigned depth,
3051                                                hwloc_obj_t *f) {
3052   if (o->depth == depth) {
3053     if (*f == NULL)
3054       *f = o; // output first descendant found
3055     return 1;
3056   }
3057   int sum = 0;
3058   for (unsigned i = 0; i < o->arity; i++)
3059     sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
3060   return sum; // will be 0 if none found (as PU arity is 0)
3061 }
3062
3063 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
3064   // skip PU descendants of object o
3065   int skipped = 0;
3066   hwloc_obj_t hT = NULL;
3067   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3068   for (int i = 0; i < N; ++i) {
3069     KMP_DEBUG_ASSERT(hT);
3070     unsigned idx = hT->os_index;
3071     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3072       KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3073       KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3074       ++skipped;
3075     }
3076     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3077   }
3078   return skipped; // count number of skipped units
3079 }
3080
3081 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
3082   // check if obj has PUs present in fullMask
3083   hwloc_obj_t hT = NULL;
3084   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3085   for (int i = 0; i < N; ++i) {
3086     KMP_DEBUG_ASSERT(hT);
3087     unsigned idx = hT->os_index;
3088     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
3089       return 1; // found PU
3090     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3091   }
3092   return 0; // no PUs found
3093 }
3094 #endif // KMP_USE_HWLOC
3095
3096 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
3097   AddrUnsPair *newAddr;
3098   if (__kmp_hws_requested == 0)
3099     goto _exit; // no topology limiting actions requested, exit
3100 #if KMP_USE_HWLOC
3101   if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3102     // The number of subobjects is calculated dynamically, so this works fine
3103     // for any non-uniform topology.
3104     // L2 cache objects are determined by depth, other objects - by type.
3105     hwloc_topology_t tp = __kmp_hwloc_topology;
3106     int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped
3107     int nCr=0, nTr=0; // number of requested units
3108     int nPkg=0, nCo=0, n_new=0, n_old=0, nCpP=0, nTpC=0; // counters
3109     hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3110     int L2depth, idx;
3111
3112     // check support of extensions ----------------------------------
3113     int numa_support = 0, tile_support = 0;
3114     if (__kmp_pu_os_idx)
3115       hT = hwloc_get_pu_obj_by_os_index(tp,
3116                                         __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3117     else
3118       hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3119     if (hT == NULL) { // something's gone wrong
3120       KMP_WARNING(AffHWSubsetUnsupported);
3121       goto _exit;
3122     }
3123     // check NUMA node
3124     hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3125     hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3126     if (hN != NULL && hN->depth > hS->depth) {
3127       numa_support = 1; // 1 in case socket includes node(s)
3128     } else if (__kmp_hws_node.num > 0) {
3129       // don't support sockets inside NUMA node (no such HW found for testing)
3130       KMP_WARNING(AffHWSubsetUnsupported);
3131       goto _exit;
3132     }
3133     // check L2 cache, get object by depth because of multiple caches
3134     L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3135     hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3136     if (hL != NULL && __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3137                                                          &hC) > 1) {
3138       tile_support = 1; // no sense to count L2 if it includes a single core
3139     } else if (__kmp_hws_tile.num > 0) {
3140       if (__kmp_hws_core.num == 0) {
3141         __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3142         __kmp_hws_tile.num = 0;
3143       } else {
3144         // L2 and core are both requested, but represent same object
3145         KMP_WARNING(AffHWSubsetInvalid);
3146         goto _exit;
3147       }
3148     }
3149     // end of check of extensions -----------------------------------
3150
3151     // fill in unset items, validate settings -----------------------
3152     if (__kmp_hws_socket.num == 0)
3153       __kmp_hws_socket.num = nPackages; // use all available sockets
3154     if (__kmp_hws_socket.offset >= nPackages) {
3155       KMP_WARNING(AffHWSubsetManySockets);
3156       goto _exit;
3157     }
3158     if (numa_support) {
3159       int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3160                                                   &hN); // num nodes in socket
3161       if (__kmp_hws_node.num == 0)
3162         __kmp_hws_node.num = NN; // use all available nodes
3163       if (__kmp_hws_node.offset >= NN) {
3164         KMP_WARNING(AffHWSubsetManyNodes);
3165         goto _exit;
3166       }
3167       if (tile_support) {
3168         // get num tiles in node
3169         int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3170         if (__kmp_hws_tile.num == 0) {
3171           __kmp_hws_tile.num = NL + 1;
3172         } // use all available tiles, some node may have more tiles, thus +1
3173         if (__kmp_hws_tile.offset >= NL) {
3174           KMP_WARNING(AffHWSubsetManyTiles);
3175           goto _exit;
3176         }
3177         int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3178                                                     &hC); // num cores in tile
3179         if (__kmp_hws_core.num == 0)
3180           __kmp_hws_core.num = NC; // use all available cores
3181         if (__kmp_hws_core.offset >= NC) {
3182           KMP_WARNING(AffHWSubsetManyCores);
3183           goto _exit;
3184         }
3185       } else { // tile_support
3186         int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
3187                                                     &hC); // num cores in node
3188         if (__kmp_hws_core.num == 0)
3189           __kmp_hws_core.num = NC; // use all available cores
3190         if (__kmp_hws_core.offset >= NC) {
3191           KMP_WARNING(AffHWSubsetManyCores);
3192           goto _exit;
3193         }
3194       } // tile_support
3195     } else { // numa_support
3196       if (tile_support) {
3197         // get num tiles in socket
3198         int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3199         if (__kmp_hws_tile.num == 0)
3200           __kmp_hws_tile.num = NL; // use all available tiles
3201         if (__kmp_hws_tile.offset >= NL) {
3202           KMP_WARNING(AffHWSubsetManyTiles);
3203           goto _exit;
3204         }
3205         int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3206                                                     &hC); // num cores in tile
3207         if (__kmp_hws_core.num == 0)
3208           __kmp_hws_core.num = NC; // use all available cores
3209         if (__kmp_hws_core.offset >= NC) {
3210           KMP_WARNING(AffHWSubsetManyCores);
3211           goto _exit;
3212         }
3213       } else { // tile_support
3214         int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
3215                                                     &hC); // num cores in socket
3216         if (__kmp_hws_core.num == 0)
3217           __kmp_hws_core.num = NC; // use all available cores
3218         if (__kmp_hws_core.offset >= NC) {
3219           KMP_WARNING(AffHWSubsetManyCores);
3220           goto _exit;
3221         }
3222       } // tile_support
3223     }
3224     if (__kmp_hws_proc.num == 0)
3225       __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
3226     if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
3227       KMP_WARNING(AffHWSubsetManyProcs);
3228       goto _exit;
3229     }
3230     // end of validation --------------------------------------------
3231
3232     if (pAddr) // pAddr is NULL in case of affinity_none
3233       newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
3234                                               __kmp_avail_proc); // max size
3235     // main loop to form HW subset ----------------------------------
3236     hS = NULL;
3237     int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
3238     for (int s = 0; s < NP; ++s) {
3239       // Check Socket -----------------------------------------------
3240       hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
3241       if (!__kmp_hwloc_obj_has_PUs(tp, hS))
3242         continue; // skip socket if all PUs are out of fullMask
3243       ++nS; // only count objects that have PUs in the affinity mask
3244       if (nS <= __kmp_hws_socket.offset ||
3245           nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
3246         n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
3247         continue; // move to next socket
3248       }
3249       nCr = 0; // count number of cores per socket
3250       // socket requested, go down the topology tree
3251       // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
3252       if (numa_support) {
3253         nN = 0;
3254         hN = NULL;
3255         // num nodes in current socket
3256         int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3257                                                     &hN);
3258         for (int n = 0; n < NN; ++n) {
3259           // Check NUMA Node ----------------------------------------
3260           if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
3261             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3262             continue; // skip node if all PUs are out of fullMask
3263           }
3264           ++nN;
3265           if (nN <= __kmp_hws_node.offset ||
3266               nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
3267             // skip node as not requested
3268             n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
3269             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3270             continue; // move to next node
3271           }
3272           // node requested, go down the topology tree
3273           if (tile_support) {
3274             nL = 0;
3275             hL = NULL;
3276             int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3277             for (int l = 0; l < NL; ++l) {
3278               // Check L2 (tile) ------------------------------------
3279               if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3280                 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3281                 continue; // skip tile if all PUs are out of fullMask
3282               }
3283               ++nL;
3284               if (nL <= __kmp_hws_tile.offset ||
3285                   nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3286                 // skip tile as not requested
3287                 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3288                 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3289                 continue; // move to next tile
3290               }
3291               // tile requested, go down the topology tree
3292               nC = 0;
3293               hC = NULL;
3294               // num cores in current tile
3295               int NC = __kmp_hwloc_count_children_by_type(tp, hL,
3296                                                           HWLOC_OBJ_CORE, &hC);
3297               for (int c = 0; c < NC; ++c) {
3298                 // Check Core ---------------------------------------
3299                 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3300                   hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3301                   continue; // skip core if all PUs are out of fullMask
3302                 }
3303                 ++nC;
3304                 if (nC <= __kmp_hws_core.offset ||
3305                     nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3306                   // skip core as not requested
3307                   n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3308                   hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3309                   continue; // move to next core
3310                 }
3311                 // core requested, go down to PUs
3312                 nT = 0;
3313                 nTr = 0;
3314                 hT = NULL;
3315                 // num procs in current core
3316                 int NT = __kmp_hwloc_count_children_by_type(tp, hC,
3317                                                             HWLOC_OBJ_PU, &hT);
3318                 for (int t = 0; t < NT; ++t) {
3319                   // Check PU ---------------------------------------
3320                   idx = hT->os_index;
3321                   if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3322                     hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3323                     continue; // skip PU if not in fullMask
3324                   }
3325                   ++nT;
3326                   if (nT <= __kmp_hws_proc.offset ||
3327                       nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3328                     // skip PU
3329                     KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3330                     ++n_old;
3331                     KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3332                     hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3333                     continue; // move to next PU
3334                   }
3335                   ++nTr;
3336                   if (pAddr) // collect requested thread's data
3337                     newAddr[n_new] = (*pAddr)[n_old];
3338                   ++n_new;
++n_old; 3340 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3341 } // threads loop 3342 if (nTr > 0) { 3343 ++nCr; // num cores per socket 3344 ++nCo; // total num cores 3345 if (nTr > nTpC) 3346 nTpC = nTr; // calc max threads per core 3347 } 3348 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3349 } // cores loop 3350 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3351 } // tiles loop 3352 } else { // tile_support 3353 // no tiles, check cores 3354 nC = 0; 3355 hC = NULL; 3356 // num cores in current node 3357 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, 3358 &hC); 3359 for (int c = 0; c < NC; ++c) { 3360 // Check Core --------------------------------------- 3361 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3362 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3363 continue; // skip core if all PUs are out of fullMask 3364 } 3365 ++nC; 3366 if (nC <= __kmp_hws_core.offset || 3367 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3368 // skip node as not requested 3369 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3370 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3371 continue; // move to next node 3372 } 3373 // core requested, go down to PUs 3374 nT = 0; 3375 nTr = 0; 3376 hT = NULL; 3377 int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, 3378 &hT); 3379 for (int t = 0; t < NT; ++t) { 3380 // Check PU --------------------------------------- 3381 idx = hT->os_index; 3382 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3383 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3384 continue; // skip PU if not in fullMask 3385 } 3386 ++nT; 3387 if (nT <= __kmp_hws_proc.offset || 3388 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3389 // skip PU 3390 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3391 ++n_old; 3392 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3393 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3394 continue; // move to next node 3395 } 3396 ++nTr; 3397 if (pAddr) // collect requested thread's data 3398 newAddr[n_new] = (*pAddr)[n_old]; 3399 ++n_new; 3400 ++n_old; 3401 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3402 } // threads loop 3403 if (nTr > 0) { 3404 ++nCr; // num cores per socket 3405 ++nCo; // total num cores 3406 if (nTr > nTpC) 3407 nTpC = nTr; // calc max threads per core 3408 } 3409 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3410 } // cores loop 3411 } // tiles support 3412 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3413 } // nodes loop 3414 } else { // numa_support 3415 // no NUMA support 3416 if (tile_support) { 3417 nL = 0; 3418 hL = NULL; 3419 // num tiles in current socket 3420 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3421 for (int l = 0; l < NL; ++l) { 3422 // Check L2 (tile) ------------------------------------ 3423 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3424 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3425 continue; // skip tile if all PUs are out of fullMask 3426 } 3427 ++nL; 3428 if (nL <= __kmp_hws_tile.offset || 3429 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3430 // skip tile as not requested 3431 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3432 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3433 continue; // move to next tile 3434 } 3435 // tile requested, go down the topology tree 3436 nC = 0; 3437 hC = NULL; 3438 // num cores per tile 3439 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, 3440 &hC); 3441 for (int c = 0; 
c < NC; ++c) { 3442 // Check Core --------------------------------------- 3443 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3444 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3445 continue; // skip core if all PUs are out of fullMask 3446 } 3447 ++nC; 3448 if (nC <= __kmp_hws_core.offset || 3449 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3450 // skip node as not requested 3451 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3452 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3453 continue; // move to next node 3454 } 3455 // core requested, go down to PUs 3456 nT = 0; 3457 nTr = 0; 3458 hT = NULL; 3459 // num procs per core 3460 int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, 3461 &hT); 3462 for (int t = 0; t < NT; ++t) { 3463 // Check PU --------------------------------------- 3464 idx = hT->os_index; 3465 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3466 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3467 continue; // skip PU if not in fullMask 3468 } 3469 ++nT; 3470 if (nT <= __kmp_hws_proc.offset || 3471 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3472 // skip PU 3473 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3474 ++n_old; 3475 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3476 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3477 continue; // move to next node 3478 } 3479 ++nTr; 3480 if (pAddr) // collect requested thread's data 3481 newAddr[n_new] = (*pAddr)[n_old]; 3482 ++n_new; 3483 ++n_old; 3484 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3485 } // threads loop 3486 if (nTr > 0) { 3487 ++nCr; // num cores per socket 3488 ++nCo; // total num cores 3489 if (nTr > nTpC) 3490 nTpC = nTr; // calc max threads per core 3491 } 3492 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3493 } // cores loop 3494 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3495 } // tiles loop 3496 } else { // tile_support 3497 // no tiles, check cores 3498 nC = 0; 3499 hC = NULL; 3500 // num cores in socket 3501 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, 3502 &hC); 3503 for (int c = 0; c < NC; ++c) { 3504 // Check Core ------------------------------------------- 3505 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3506 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3507 continue; // skip core if all PUs are out of fullMask 3508 } 3509 ++nC; 3510 if (nC <= __kmp_hws_core.offset || 3511 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3512 // skip node as not requested 3513 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3514 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3515 continue; // move to next node 3516 } 3517 // core requested, go down to PUs 3518 nT = 0; 3519 nTr = 0; 3520 hT = NULL; 3521 // num procs per core 3522 int NT = __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, 3523 &hT); 3524 for (int t = 0; t < NT; ++t) { 3525 // Check PU --------------------------------------- 3526 idx = hT->os_index; 3527 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3528 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3529 continue; // skip PU if not in fullMask 3530 } 3531 ++nT; 3532 if (nT <= __kmp_hws_proc.offset || 3533 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3534 // skip PU 3535 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3536 ++n_old; 3537 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3538 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3539 continue; // move to next node 3540 } 3541 ++nTr; 3542 if (pAddr) // collect 
requested thread's data 3543 newAddr[n_new] = (*pAddr)[n_old]; 3544 ++n_new; 3545 ++n_old; 3546 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3547 } // threads loop 3548 if (nTr > 0) { 3549 ++nCr; // num cores per socket 3550 ++nCo; // total num cores 3551 if (nTr > nTpC) 3552 nTpC = nTr; // calc max threads per core 3553 } 3554 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3555 } // cores loop 3556 } // tiles support 3557 } // numa_support 3558 if (nCr > 0) { // found cores? 3559 ++nPkg; // num sockets 3560 if (nCr > nCpP) 3561 nCpP = nCr; // calc max cores per socket 3562 } 3563 } // sockets loop 3564 3565 // check the subset is valid 3566 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3567 KMP_DEBUG_ASSERT(nPkg > 0); 3568 KMP_DEBUG_ASSERT(nCpP > 0); 3569 KMP_DEBUG_ASSERT(nTpC > 0); 3570 KMP_DEBUG_ASSERT(nCo > 0); 3571 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3572 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3573 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3574 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3575 3576 nPackages = nPkg; // correct num sockets 3577 nCoresPerPkg = nCpP; // correct num cores per socket 3578 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3579 __kmp_avail_proc = n_new; // correct num procs 3580 __kmp_ncores = nCo; // correct num cores 3581 // hwloc topology method end 3582 } else 3583 #endif // KMP_USE_HWLOC 3584 { 3585 int n_old = 0, n_new = 0, proc_num = 0; 3586 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3587 KMP_WARNING(AffHWSubsetNoHWLOC); 3588 goto _exit; 3589 } 3590 if (__kmp_hws_socket.num == 0) 3591 __kmp_hws_socket.num = nPackages; // use all available sockets 3592 if (__kmp_hws_core.num == 0) 3593 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3594 if (__kmp_hws_proc.num == 0 || 3595 __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3596 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3597 if ( !__kmp_affinity_uniform_topology() ) { 3598 KMP_WARNING( AffHWSubsetNonUniform ); 3599 goto _exit; // don't support non-uniform topology 3600 } 3601 if ( depth > 3 ) { 3602 KMP_WARNING( AffHWSubsetNonThreeLevel ); 3603 goto _exit; // don't support not-3-level topology 3604 } 3605 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3606 KMP_WARNING(AffHWSubsetManySockets); 3607 goto _exit; 3608 } 3609 if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) { 3610 KMP_WARNING( AffHWSubsetManyCores ); 3611 goto _exit; 3612 } 3613 // Form the requested subset 3614 if (pAddr) // pAddr is NULL in case of affinity_none 3615 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * 3616 __kmp_hws_socket.num * 3617 __kmp_hws_core.num * 3618 __kmp_hws_proc.num); 3619 for (int i = 0; i < nPackages; ++i) { 3620 if (i < __kmp_hws_socket.offset || 3621 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3622 // skip not-requested socket 3623 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3624 if (__kmp_pu_os_idx != NULL) { 3625 // walk through skipped socket 3626 for (int j = 0; j < nCoresPerPkg; ++j) { 3627 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3628 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3629 ++proc_num; 3630 } 3631 } 3632 } 3633 } else { 3634 // walk through requested socket 3635 for (int j = 0; j < nCoresPerPkg; ++j) { 3636 if (j < __kmp_hws_core.offset || 3637 j >= __kmp_hws_core.offset + __kmp_hws_core.num) 3638 { // skip not-requested core 3639 n_old += __kmp_nThreadsPerCore; 3640 if (__kmp_pu_os_idx != NULL) { 3641 for (int k = 0; k < 
__kmp_nThreadsPerCore; ++k) { 3642 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3643 ++proc_num; 3644 } 3645 } 3646 } else { 3647 // walk through requested core 3648 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3649 if (k < __kmp_hws_proc.num) { 3650 if (pAddr) // collect requested thread's data 3651 newAddr[n_new] = (*pAddr)[n_old]; 3652 n_new++; 3653 } else { 3654 if (__kmp_pu_os_idx != NULL) 3655 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3656 } 3657 n_old++; 3658 ++proc_num; 3659 } 3660 } 3661 } 3662 } 3663 } 3664 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3665 KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num * 3666 __kmp_hws_proc.num); 3667 nPackages = __kmp_hws_socket.num; // correct nPackages 3668 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 3669 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 3670 __kmp_avail_proc = n_new; // correct avail_proc 3671 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 3672 } // non-hwloc topology method 3673 if (pAddr) { 3674 __kmp_free( *pAddr ); 3675 *pAddr = newAddr; // replace old topology with new one 3676 } 3677 if (__kmp_affinity_verbose) { 3678 char m[KMP_AFFIN_MASK_PRINT_LEN]; 3679 __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask); 3680 if (__kmp_affinity_respect_mask) { 3681 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 3682 } else { 3683 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 3684 } 3685 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 3686 kmp_str_buf_t buf; 3687 __kmp_str_buf_init(&buf); 3688 __kmp_str_buf_print(&buf, "%d", nPackages); 3689 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 3690 __kmp_nThreadsPerCore, __kmp_ncores); 3691 __kmp_str_buf_free(&buf); 3692 } 3693 _exit: 3694 if (__kmp_pu_os_idx != NULL) { 3695 __kmp_free(__kmp_pu_os_idx); 3696 __kmp_pu_os_idx = NULL; 3697 } 3698 } 3699 3700 // This function figures out the deepest level at which there is at least one 3701 // cluster/core with more than one processing unit bound to it. 3702 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os, 3703 int nprocs, int bottom_level) { 3704 int core_level = 0; 3705 3706 for (int i = 0; i < nprocs; i++) { 3707 for (int j = bottom_level; j > 0; j--) { 3708 if (address2os[i].first.labels[j] > 0) { 3709 if (core_level < (j - 1)) { 3710 core_level = j - 1; 3711 } 3712 } 3713 } 3714 } 3715 return core_level; 3716 } 3717 3718 // This function counts number of clusters/cores at given level. 3719 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, 3720 int nprocs, int bottom_level, 3721 int core_level) { 3722 int ncores = 0; 3723 int i, j; 3724 3725 j = bottom_level; 3726 for (i = 0; i < nprocs; i++) { 3727 for (j = bottom_level; j > core_level; j--) { 3728 if ((i + 1) < nprocs) { 3729 if (address2os[i + 1].first.labels[j] > 0) { 3730 break; 3731 } 3732 } 3733 } 3734 if (j == core_level) { 3735 ncores++; 3736 } 3737 } 3738 if (j > core_level) { 3739 // In case of ( nprocs < __kmp_avail_proc ) we may end too deep and miss one 3740 // core. May occur when called from __kmp_affinity_find_core(). 3741 ncores++; 3742 } 3743 return ncores; 3744 } 3745 3746 // This function finds to which cluster/core given processing unit is bound. 
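
// Informal example of the two helpers above: on a 2-socket x 2-core x
// 2-thread machine whose labels are {socket, core, thread} and whose
// bottom_level is 2, some addresses have labels[2] == 1, so
// __kmp_affinity_find_core_level() returns 1 (the core level), and
// __kmp_affinity_compute_ncores(..., core_level = 1) counts the 4 distinct
// cores.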

// This function finds to which cluster/core the given processing unit is
// bound.
static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
                                    int bottom_level, int core_level) {
  return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
                                       core_level) -
         1;
}

// This function finds the maximal number of processing units bound to a
// cluster/core at the given level.
static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
                                            int nprocs, int bottom_level,
                                            int core_level) {
  int maxprocpercore = 0;

  if (core_level < bottom_level) {
    for (int i = 0; i < nprocs; i++) {
      int percore = address2os[i].first.labels[core_level + 1] + 1;

      if (percore > maxprocpercore) {
        maxprocpercore = percore;
      }
    }
  } else {
    maxprocpercore = 1;
  }
  return maxprocpercore;
}

static AddrUnsPair *address2os = NULL;
static int *procarr = NULL;
static int __kmp_aff_depth = 0;

#define KMP_EXIT_AFF_NONE                                                      \
  KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
  KMP_ASSERT(address2os == NULL);                                              \
  __kmp_apply_thread_places(NULL, 0);                                          \
  return;

static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
  const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
  const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  unsigned i;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->childNums[j] < bb->childNums[j])
      return -1;
    if (aa->childNums[j] > bb->childNums[j])
      return 1;
  }
  return 0;
}
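
// Informal example of the permutation in the comparator above: with
// depth == 3 (socket, core, thread) and __kmp_affinity_compact == 1,
// addresses are compared by thread (childNums[2]) first, then by socket
// (childNums[0]) and core (childNums[1]); with __kmp_affinity_compact == 0
// the comparison runs in plain outermost-to-innermost order.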

static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors
  // that we know about on the machine.
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    if (__kmp_affinity_respect_mask) {
      __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);

      // Count the number of available processors.
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }
    } else {
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
    }
  }

  int depth = -1;
  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or
    // the verbose flag is set, and the nowarnings flag was not set.
    const char *file_name = NULL;
    int line = 0;
#if KMP_USE_HWLOC
    if (depth < 0 &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      }
      if (!__kmp_hwloc_error) {
        depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        } else if (depth < 0 && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
      }

      file_name = NULL;
      depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }

      if (depth < 0) {
        if (__kmp_affinity_verbose) {
          if (msg_id != kmp_i18n_null) {
            KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
                       __kmp_i18n_catgets(msg_id),
                       KMP_I18N_STR(DecodingLegacyAPIC));
          } else {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
                       KMP_I18N_STR(DecodingLegacyAPIC));
          }
        }

        file_name = NULL;
        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX

    if (depth < 0) {
      if (__kmp_affinity_verbose) {
        if (msg_id != kmp_i18n_null) {
          KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
                     __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
        } else {
          KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
        }
      }

      FILE *f = fopen("/proc/cpuinfo", "r");
      if (f == NULL) {
        msg_id = kmp_i18n_str_CantOpenCpuinfo;
      } else {
        file_name = "/proc/cpuinfo";
        depth =
            __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth == 0) {
          KMP_EXIT_AFF_NONE;
        }
      }
    }

#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY

    if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
      }

      depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
      KMP_ASSERT(depth != 0);
    }

#endif /* KMP_GROUP_AFFINITY */

    if (depth < 0) {
      if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
        if (file_name == NULL) {
          KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
        } else if (line == 0) {
          KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
        } else {
          KMP_INFORM(UsingFlatOSFileLine, file_name, line,
                     __kmp_i18n_catgets(msg_id));
        }
      }
      // FIXME - print msg if msg_id = kmp_i18n_null ???

      file_name = "";
      depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
      if (depth == 0) {
        KMP_EXIT_AFF_NONE;
      }
      KMP_ASSERT(depth > 0);
      KMP_ASSERT(address2os != NULL);
    }
  }
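
  // To summarize the default path above: the discovery methods are tried in
  // order (hwloc if enabled, then x2APIC ids, legacy APIC ids, /proc/cpuinfo
  // on Linux, Windows processor groups, and finally the flat OS-proc map),
  // and the first method that yields a topology wins.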

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
    }

    depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
    }

    depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    const char *filename;
    if (__kmp_cpuinfo_file != NULL) {
      filename = __kmp_cpuinfo_file;
    } else {
      filename = "/proc/cpuinfo";
    }

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
    }

    FILE *f = fopen(filename, "r");
    if (f == NULL) {
      int code = errno;
      if (__kmp_cpuinfo_file != NULL) {
        __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
                  KMP_ERR(code), KMP_HNT(NameComesFrom_CPUINFO_FILE),
                  __kmp_msg_null);
      } else {
        __kmp_msg(kmp_ms_fatal, KMP_MSG(CantOpenFileForReading, filename),
                  KMP_ERR(code), __kmp_msg_null);
      }
    }
    int line = 0;
    depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
    fclose(f);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
    if (__kmp_affinity_type == affinity_none) {
      KMP_ASSERT(depth == 0);
      KMP_EXIT_AFF_NONE;
    }
  }

#if KMP_GROUP_AFFINITY

  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
    }

    depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
    KMP_ASSERT(depth != 0);
    if (depth < 0) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }

#endif /* KMP_GROUP_AFFINITY */

  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
    }

    depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
    // should not fail
    KMP_ASSERT(depth > 0);
    KMP_ASSERT(address2os != NULL);
  }

#if KMP_USE_HWLOC
  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
    }
    depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
    if (depth == 0) {
      KMP_EXIT_AFF_NONE;
    }
  }
#endif // KMP_USE_HWLOC

  if (address2os == NULL) {
    if (KMP_AFFINITY_CAPABLE() &&
        (__kmp_affinity_verbose ||
         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
      KMP_WARNING(ErrorInitializeAffinity);
    }
    __kmp_affinity_type = affinity_none;
    KMP_AFFINITY_DISABLE();
    return;
  }

  __kmp_apply_thread_places(&address2os, depth);

  // Create the table of masks, indexed by thread Id.
  unsigned maxIndex;
  unsigned numUnique;
  kmp_affin_mask_t *osId2Mask =
      __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
  if (__kmp_affinity_gran_levels == 0) {
    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
  }

  // Set the childNums vector in all Address objects. This must be done before
  // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
  // account the setting of __kmp_affinity_compact.
  __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

  switch (__kmp_affinity_type) {

  case affinity_explicit:
    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
#if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
#endif
    {
      __kmp_affinity_process_proclist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    }
#if OMP_40_ENABLED
    else {
      __kmp_affinity_process_placelist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    }
#endif
    if (__kmp_affinity_num_masks == 0) {
      if (__kmp_affinity_verbose ||
          (__kmp_affinity_warnings &&
           (__kmp_affinity_type != affinity_none))) {
        KMP_WARNING(AffNoValidProcID);
      }
      __kmp_affinity_type = affinity_none;
      return;
    }
    break;

  // The other affinity types rely on sorting the Addresses according to some
  // permutation of the machine topology tree. Set __kmp_affinity_compact and
  // __kmp_affinity_offset appropriately, then jump to a common code fragment
  // to do the sort and create the array of affinity masks.
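  // Informally: affinity_compact clamps the user's compact level to
  // depth - 1, while affinity_scatter inverts it (the effective compact
  // becomes depth - 1 - value); e.g. scatter with value 0 on a 3-level
  // topology yields compact == 2, which sorts the innermost labels first and
  // thereby places consecutive threads on different sockets.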

  case affinity_logical:
    __kmp_affinity_compact = 0;
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortAddresses;

  case affinity_physical:
    if (__kmp_nThreadsPerCore > 1) {
      __kmp_affinity_compact = 1;
      if (__kmp_affinity_compact >= depth) {
        __kmp_affinity_compact = 0;
      }
    } else {
      __kmp_affinity_compact = 0;
    }
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortAddresses;

  case affinity_scatter:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = 0;
    } else {
      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
    }
    goto sortAddresses;

  case affinity_compact:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }
    goto sortAddresses;

  case affinity_balanced:
    if (depth <= 1) {
      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
      }
      __kmp_affinity_type = affinity_none;
      return;
    } else if (__kmp_affinity_uniform_topology()) {
      break;
    } else { // Non-uniform topology

      // Save the depth for further usage
      __kmp_aff_depth = depth;

      int core_level = __kmp_affinity_find_core_level(
          address2os, __kmp_avail_proc, depth - 1);
      int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                                 depth - 1, core_level);
      int maxprocpercore = __kmp_affinity_max_proc_per_core(
          address2os, __kmp_avail_proc, depth - 1, core_level);

      int nproc = ncores * maxprocpercore;
      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
        }
        __kmp_affinity_type = affinity_none;
        return;
      }

      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        procarr[i] = -1;
      }

      int lastcore = -1;
      int inlastcore = 0;
      for (int i = 0; i < __kmp_avail_proc; i++) {
        int proc = address2os[i].second;
        int core =
            __kmp_affinity_find_core(address2os, i, depth - 1, core_level);

        if (core == lastcore) {
          inlastcore++;
        } else {
          inlastcore = 0;
        }
        lastcore = core;

        procarr[core * maxprocpercore + inlastcore] = proc;
      }

      break;
    }

  sortAddresses:
    // Allocate the gtid->affinity mask table.
    if (__kmp_affinity_dups) {
      __kmp_affinity_num_masks = __kmp_avail_proc;
    } else {
      __kmp_affinity_num_masks = numUnique;
    }

#if OMP_40_ENABLED
    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
        (__kmp_affinity_num_places > 0) &&
        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
      __kmp_affinity_num_masks = __kmp_affinity_num_places;
    }
#endif

    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

    // Sort the address2os table according to the current setting of
    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
    qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
    {
      int i;
      unsigned j;
      for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
        if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
          continue;
        }
        unsigned osId = address2os[i].second;
        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
  machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none in
  // this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
#if OMP_40_ENABLED
  __kmp_affinity_num_places = 0;
#endif
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (address2os != NULL) {
    __kmp_free(address2os);
    address2os = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  KMPAffinity::destroy_api();
}

void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
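  // For example (informal): with 4 affinity masks and a zero offset, the
  // code below binds thread gtid to place gtid % 4, i.e. the places are
  // handed out round-robin by global thread id.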
  kmp_affin_mask_t *mask;
  int i;

#if OMP_40_ENABLED
  if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
#endif
  {
    if ((__kmp_affinity_type == affinity_none) ||
        (__kmp_affinity_type == affinity_balanced)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }
#if OMP_40_ENABLED
  else {
    if ((!isa_root) ||
        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      // int i = some hash function or just a counter that doesn't
      // always start at 0. Use gtid for now.
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }
#endif

#if OMP_40_ENABLED
  th->th.th_current_place = i;
  if (isa_root) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100,
             ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
              gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
                   gtid, i));
  }
#else
  if (i == -1) {
    KA_TRACE(
        100,
        ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n",
         gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
                   gtid, i));
  }
#endif /* OMP_40_ENABLED */

  KMP_CPU_COPY(th->th.th_affin_mask, mask);

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }

#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  // See CQ171393.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

#if OMP_40_ENABLED

void __kmp_affinity_set_place(int gtid) {
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

#endif /* OMP_40_ENABLED */

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(1000, ; {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf(
        "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
        buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

#if OMP_40_ENABLED
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
#endif

  return retval;
}
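
// A minimal usage sketch of the exported API that lands in
// __kmp_aux_set_affinity() (assuming the kmp_* affinity extensions declared
// in kmp.h; error handling elided):
//
//   kmp_affinity_mask_t new_mask;
//   kmp_create_affinity_mask(&new_mask);
//   kmp_set_affinity_mask_proc(0, &new_mask); // request OS proc 0
//   if (kmp_set_affinity(&new_mask) != 0) {
//     // nonzero: the OS rejected the mask
//   }
//   kmp_destroy_affinity_mask(&new_mask);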

int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
  kmp_info_t *th;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(1000, ; {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
                 gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(1000, ; {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
                 gtid, buf);
  });
  return retval;

#else

  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}

int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}
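
// Informal example: on a 64-bit Windows machine with 2 processor groups, the
// function above reports 2 * sizeof(DWORD_PTR) * CHAR_BIT = 2 * 64 = 128
// procs, since each group contributes a full 64-bit mask even when some of
// its bits are unpopulated.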

int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, ; {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, ; {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(1000, ; {
    int gtid = __kmp_entry_gtid();
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              (kmp_affin_mask_t *)(*mask));
    __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                       "affinity mask for thread %d = %s\n",
                       proc, gtid, buf);
  });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(int tid, int nthreads) {
  bool fine_gran = true;

  switch (__kmp_affinity_gran) {
  case affinity_gran_fine:
  case affinity_gran_thread:
    break;
  case affinity_gran_core:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case affinity_gran_package:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_affinity_uniform_topology()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to them - the
    // "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
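    // Worked example (informal): nthreads == 10 on ncores == 4 gives
    // chunk == 2, big_cores == 2 and big_nth == 6, so tids 0-2 land on
    // core 0 and tids 3-5 on core 1 (the two "big" cores), while tids 6-7
    // go to core 2 and tids 8-9 to core 3.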
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }

    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = address2os[coreID * __kmp_nth_per_core + i].second;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);

    int core_level = __kmp_affinity_find_core_level(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = address2os[tid].second;
        KMP_CPU_SET(osID, mask);
      } else {
        int core = __kmp_affinity_find_core(address2os, tid,
                                            __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = address2os[i].second;
          if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
                                       core_level) == core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first
                // available osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors;
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
}

#if KMP_OS_LINUX
// We don't need this entry for Windows because
// there is a GetProcessAffinityMask() API.
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
// (A short sketch of these steps follows the function below.)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
// -1 if we cannot bind thread
// >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-OpenMP threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
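
// A minimal sketch of the five steps above (assuming Linux with _GNU_SOURCE
// so that pthread_getaffinity_np()/pthread_setaffinity_np() are available;
// run_non_openmp_parallel_code() is a hypothetical helper, and error
// handling is elided):
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//   int rc = kmp_set_thread_affinity_mask_initial();               // step 2
//   if (rc == 0) {                                                 // step 3
//     run_non_openmp_parallel_code();                              // step 4
//     pthread_setaffinity_np(pthread_self(), sizeof(saved),
//                            &saved);                              // step 5
//   }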
#endif

#endif // KMP_AFFINITY_SUPPORTED