1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 6 //===----------------------------------------------------------------------===// 7 // 8 // The LLVM Compiler Infrastructure 9 // 10 // This file is dual licensed under the MIT and the University of Illinois Open 11 // Source Licenses. See LICENSE.txt for details. 12 // 13 //===----------------------------------------------------------------------===// 14 15 16 #include "kmp.h" 17 #include "kmp_i18n.h" 18 #include "kmp_io.h" 19 #include "kmp_str.h" 20 #include "kmp_wrapper_getpid.h" 21 #include "kmp_affinity.h" 22 23 // Store the real or imagined machine hierarchy here 24 static hierarchy_info machine_hierarchy; 25 26 void __kmp_cleanup_hierarchy() { 27 machine_hierarchy.fini(); 28 } 29 30 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 31 kmp_uint32 depth; 32 // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier. 33 if (TCR_1(machine_hierarchy.uninitialized)) 34 machine_hierarchy.init(NULL, nproc); 35 36 // Adjust the hierarchy in case num threads exceeds original 37 if (nproc > machine_hierarchy.base_num_threads) 38 machine_hierarchy.resize(nproc); 39 40 depth = machine_hierarchy.depth; 41 KMP_DEBUG_ASSERT(depth > 0); 42 43 thr_bar->depth = depth; 44 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1; 45 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 46 } 47 48 #if KMP_AFFINITY_SUPPORTED 49 50 bool KMPAffinity::picked_api = false; 51 52 void* KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } 53 void* KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } 54 void KMPAffinity::Mask::operator delete(void* p) { __kmp_free(p); } 55 void KMPAffinity::Mask::operator delete[](void* p) { __kmp_free(p); } 56 void* KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } 57 void KMPAffinity::operator delete(void* p) { __kmp_free(p); } 58 59 void KMPAffinity::pick_api() { 60 KMPAffinity* affinity_dispatch; 61 if (picked_api) 62 return; 63 #if KMP_USE_HWLOC 64 if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 65 affinity_dispatch = new KMPHwlocAffinity(); 66 } else 67 #endif 68 { 69 affinity_dispatch = new KMPNativeAffinity(); 70 } 71 __kmp_affinity_dispatch = affinity_dispatch; 72 picked_api = true; 73 } 74 75 void KMPAffinity::destroy_api() { 76 if (__kmp_affinity_dispatch != NULL) { 77 delete __kmp_affinity_dispatch; 78 __kmp_affinity_dispatch = NULL; 79 picked_api = false; 80 } 81 } 82 83 // 84 // Print the affinity mask to the character array in a pretty format. 85 // 86 char * 87 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) 88 { 89 KMP_ASSERT(buf_len >= 40); 90 char *scan = buf; 91 char *end = buf + buf_len - 1; 92 93 // 94 // Find first element / check for empty set. 95 // 96 size_t i; 97 i = mask->begin(); 98 if (i == mask->end()) { 99 KMP_SNPRINTF(scan, end-scan+1, "{<empty>}"); 100 while (*scan != '\0') scan++; 101 KMP_ASSERT(scan <= end); 102 return buf; 103 } 104 105 KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i); 106 while (*scan != '\0') scan++; 107 i++; 108 for (; i != mask->end(); i = mask->next(i)) { 109 if (! KMP_CPU_ISSET(i, mask)) { 110 continue; 111 } 112 113 // 114 // Check for buffer overflow. A string of the form ",<n>" will have 115 // at most 10 characters, plus we want to leave room to print ",...}" 116 // if the set is too large to print for a total of 15 characters. 
117 // We already left room for '\0' in setting end. 118 // 119 if (end - scan < 15) { 120 break; 121 } 122 KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i); 123 while (*scan != '\0') scan++; 124 } 125 if (i != mask->end()) { 126 KMP_SNPRINTF(scan, end-scan+1, ",..."); 127 while (*scan != '\0') scan++; 128 } 129 KMP_SNPRINTF(scan, end-scan+1, "}"); 130 while (*scan != '\0') scan++; 131 KMP_ASSERT(scan <= end); 132 return buf; 133 } 134 135 136 void 137 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) 138 { 139 KMP_CPU_ZERO(mask); 140 141 # if KMP_GROUP_AFFINITY 142 143 if (__kmp_num_proc_groups > 1) { 144 int group; 145 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 146 for (group = 0; group < __kmp_num_proc_groups; group++) { 147 int i; 148 int num = __kmp_GetActiveProcessorCount(group); 149 for (i = 0; i < num; i++) { 150 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 151 } 152 } 153 } 154 else 155 156 # endif /* KMP_GROUP_AFFINITY */ 157 158 { 159 int proc; 160 for (proc = 0; proc < __kmp_xproc; proc++) { 161 KMP_CPU_SET(proc, mask); 162 } 163 } 164 } 165 166 // 167 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be 168 // called to renumber the labels from [0..n] and place them into the child_num 169 // vector of the address object. This is done in case the labels used for 170 // the children at one node of the hierarchy differ from those used for 171 // another node at the same level. Example: suppose the machine has 2 nodes 172 // with 2 packages each. The first node contains packages 601 and 602, and 173 // second node contains packages 603 and 604. If we try to sort the table 174 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 175 // because we are paying attention to the labels themselves, not the ordinal 176 // child numbers. By using the child numbers in the sort, the result is 177 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. 178 // 179 static void 180 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 181 int numAddrs) 182 { 183 KMP_DEBUG_ASSERT(numAddrs > 0); 184 int depth = address2os->first.depth; 185 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 186 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth 187 * sizeof(unsigned)); 188 int labCt; 189 for (labCt = 0; labCt < depth; labCt++) { 190 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 191 lastLabel[labCt] = address2os[0].first.labels[labCt]; 192 } 193 int i; 194 for (i = 1; i < numAddrs; i++) { 195 for (labCt = 0; labCt < depth; labCt++) { 196 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 197 int labCt2; 198 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 199 counts[labCt2] = 0; 200 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 201 } 202 counts[labCt]++; 203 lastLabel[labCt] = address2os[i].first.labels[labCt]; 204 break; 205 } 206 } 207 for (labCt = 0; labCt < depth; labCt++) { 208 address2os[i].first.childNums[labCt] = counts[labCt]; 209 } 210 for (; labCt < (int)Address::maxDepth; labCt++) { 211 address2os[i].first.childNums[labCt] = 0; 212 } 213 } 214 __kmp_free(lastLabel); 215 __kmp_free(counts); 216 } 217 218 219 // 220 // All of the __kmp_affinity_create_*_map() routines should set 221 // __kmp_affinity_masks to a vector of affinity mask objects of length 222 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and 223 // return the number of levels in the machine topology tree (zero if 224 // __kmp_affinity_type == affinity_none). 
225 // 226 // All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask 227 // to the affinity mask for the initialization thread. They need to save and 228 // restore the mask, and it could be needed later, so saving it is just an 229 // optimization to avoid calling kmp_get_system_affinity() again. 230 // 231 kmp_affin_mask_t *__kmp_affin_fullMask = NULL; 232 233 static int nCoresPerPkg, nPackages; 234 static int __kmp_nThreadsPerCore; 235 #ifndef KMP_DFLT_NTH_CORES 236 static int __kmp_ncores; 237 #endif 238 static int *__kmp_pu_os_idx = NULL; 239 240 // 241 // __kmp_affinity_uniform_topology() doesn't work when called from 242 // places which support arbitrarily many levels in the machine topology 243 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() 244 // __kmp_affinity_create_x2apicid_map(). 245 // 246 inline static bool 247 __kmp_affinity_uniform_topology() 248 { 249 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); 250 } 251 252 253 // 254 // Print out the detailed machine topology map, i.e. the physical locations 255 // of each OS proc. 256 // 257 static void 258 __kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth, 259 int pkgLevel, int coreLevel, int threadLevel) 260 { 261 int proc; 262 263 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY"); 264 for (proc = 0; proc < len; proc++) { 265 int level; 266 kmp_str_buf_t buf; 267 __kmp_str_buf_init(&buf); 268 for (level = 0; level < depth; level++) { 269 if (level == threadLevel) { 270 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread)); 271 } 272 else if (level == coreLevel) { 273 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core)); 274 } 275 else if (level == pkgLevel) { 276 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package)); 277 } 278 else if (level > pkgLevel) { 279 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node), 280 level - pkgLevel - 1); 281 } 282 else { 283 __kmp_str_buf_print(&buf, "L%d ", level); 284 } 285 __kmp_str_buf_print(&buf, "%d ", 286 address2os[proc].first.labels[level]); 287 } 288 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second, 289 buf.str); 290 __kmp_str_buf_free(&buf); 291 } 292 } 293 294 #if KMP_USE_HWLOC 295 296 // This function removes the topology levels that are radix 1 and don't offer 297 // further information about the topology. The most common example is when you 298 // have one thread context per core, we don't want the extra thread context 299 // level if it offers no unique labels. So they are removed. 
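// For example (hypothetical machine): with depth 3 and labels of the form
// {package, core, thread}, a machine exposing only one thread context per
// core carries the same thread label (0) in every entry, so the thread level
// is radix 1 and is dropped:
//
//     {0,0,0} {0,1,0} {1,0,0} {1,1,0}   ->   {0,0} {0,1} {1,0} {1,1}
//
// and the depth returned drops from 3 to 2.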
300 // return value: the new depth of address2os 301 static int 302 __kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) { 303 int level; 304 int i; 305 int radix1_detected; 306 307 for (level = depth-1; level >= 0; --level) { 308 // Always keep the package level 309 if (level == *pkgLevel) 310 continue; 311 // Detect if this level is radix 1 312 radix1_detected = 1; 313 for (i = 1; i < nActiveThreads; ++i) { 314 if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) { 315 // There are differing label values for this level so it stays 316 radix1_detected = 0; 317 break; 318 } 319 } 320 if (!radix1_detected) 321 continue; 322 // Radix 1 was detected 323 if (level == *threadLevel) { 324 // If only one thread per core, then just decrement 325 // the depth which removes the threadlevel from address2os 326 for (i = 0; i < nActiveThreads; ++i) { 327 address2os[i].first.depth--; 328 } 329 *threadLevel = -1; 330 } else if (level == *coreLevel) { 331 // For core level, we move the thread labels over if they are still 332 // valid (*threadLevel != -1), and also reduce the depth another level 333 for (i = 0; i < nActiveThreads; ++i) { 334 if (*threadLevel != -1) { 335 address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel]; 336 } 337 address2os[i].first.depth--; 338 } 339 *coreLevel = -1; 340 } 341 } 342 return address2os[0].first.depth; 343 } 344 345 // Returns the number of objects of type 'type' below 'obj' within the topology tree structure. 346 // e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then 347 // this will return the number of PU's under the SOCKET object. 348 static int 349 __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) { 350 int retval = 0; 351 hwloc_obj_t first; 352 for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0); 353 first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj; 354 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first)) 355 { 356 ++retval; 357 } 358 return retval; 359 } 360 361 static int 362 __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os, 363 kmp_i18n_id_t *const msg_id) 364 { 365 *address2os = NULL; 366 *msg_id = kmp_i18n_null; 367 368 // 369 // Save the affinity mask for the current thread. 370 // 371 kmp_affin_mask_t *oldMask; 372 KMP_CPU_ALLOC(oldMask); 373 __kmp_get_system_affinity(oldMask, TRUE); 374 375 int depth = 3; 376 int pkgLevel = 0; 377 int coreLevel = 1; 378 int threadLevel = 2; 379 380 if (! KMP_AFFINITY_CAPABLE()) 381 { 382 // 383 // Hack to try and infer the machine topology using only the data 384 // available from cpuid on the current thread, and __kmp_xproc. 
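        //
        // The counting helper defined above is used just below; with
        // hypothetical hwloc data reporting 4 HWLOC_OBJ_CORE objects under
        // the first HWLOC_OBJ_SOCKET and 2 HWLOC_OBJ_PU objects under the
        // first HWLOC_OBJ_CORE, the two calls yield nCoresPerPkg = 4 and
        // __kmp_nThreadsPerCore = 2.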
385 // 386 KMP_ASSERT(__kmp_affinity_type == affinity_none); 387 388 nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE); 389 __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU); 390 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 391 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 392 if (__kmp_affinity_verbose) { 393 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 394 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 395 if (__kmp_affinity_uniform_topology()) { 396 KMP_INFORM(Uniform, "KMP_AFFINITY"); 397 } else { 398 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 399 } 400 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 401 __kmp_nThreadsPerCore, __kmp_ncores); 402 } 403 KMP_CPU_FREE(oldMask); 404 return 0; 405 } 406 407 // 408 // Allocate the data structure to be returned. 409 // 410 AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 411 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 412 413 // 414 // When affinity is off, this routine will still be called to set 415 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 416 // nCoresPerPkg, & nPackages. Make sure all these vars are set 417 // correctly, and return if affinity is not enabled. 418 // 419 420 hwloc_obj_t pu; 421 hwloc_obj_t core; 422 hwloc_obj_t socket; 423 int nActiveThreads = 0; 424 int socket_identifier = 0; 425 // re-calculate globals to count only accessible resources 426 __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0; 427 for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0); 428 socket != NULL; 429 socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket), 430 socket_identifier++) 431 { 432 int core_identifier = 0; 433 int num_active_cores = 0; 434 for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0); 435 core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket; 436 core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core), 437 core_identifier++) 438 { 439 int pu_identifier = 0; 440 int num_active_threads = 0; 441 for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0); 442 pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core; 443 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu), 444 pu_identifier++) 445 { 446 Address addr(3); 447 if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask)) 448 continue; // skip inactive (inaccessible) unit 449 KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n", 450 socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index,pu->logical_index)); 451 addr.labels[0] = socket_identifier; // package 452 addr.labels[1] = core_identifier; // core 453 addr.labels[2] = pu_identifier; // pu 454 retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index); 455 __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu 456 nActiveThreads++; 457 ++num_active_threads; // count active threads per core 458 } 459 if (num_active_threads) { // were there any active threads on the core? 
460 ++__kmp_ncores; // count total active cores 461 ++num_active_cores; // count active cores per socket 462 if (num_active_threads > __kmp_nThreadsPerCore) 463 __kmp_nThreadsPerCore = num_active_threads; // calc maximum 464 } 465 } 466 if (num_active_cores) { // were there any active cores on the socket? 467 ++nPackages; // count total active packages 468 if (num_active_cores > nCoresPerPkg) 469 nCoresPerPkg = num_active_cores; // calc maximum 470 } 471 } 472 473 // 474 // If there's only one thread context to bind to, return now. 475 // 476 KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc); 477 KMP_ASSERT(nActiveThreads > 0); 478 if (nActiveThreads == 1) { 479 __kmp_ncores = nPackages = 1; 480 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 481 if (__kmp_affinity_verbose) { 482 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 483 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 484 485 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 486 if (__kmp_affinity_respect_mask) { 487 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 488 } else { 489 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 490 } 491 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 492 KMP_INFORM(Uniform, "KMP_AFFINITY"); 493 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 494 __kmp_nThreadsPerCore, __kmp_ncores); 495 } 496 497 if (__kmp_affinity_type == affinity_none) { 498 __kmp_free(retval); 499 KMP_CPU_FREE(oldMask); 500 return 0; 501 } 502 503 // 504 // Form an Address object which only includes the package level. 505 // 506 Address addr(1); 507 addr.labels[0] = retval[0].first.labels[pkgLevel]; 508 retval[0].first = addr; 509 510 if (__kmp_affinity_gran_levels < 0) { 511 __kmp_affinity_gran_levels = 0; 512 } 513 514 if (__kmp_affinity_verbose) { 515 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 516 } 517 518 *address2os = retval; 519 KMP_CPU_FREE(oldMask); 520 return 1; 521 } 522 523 // 524 // Sort the table by physical Id. 525 // 526 qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 527 528 // 529 // Check to see if the machine topology is uniform 530 // 531 unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads); 532 533 // 534 // Print the machine topology summary. 535 // 536 if (__kmp_affinity_verbose) { 537 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 538 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 539 540 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 541 if (__kmp_affinity_respect_mask) { 542 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 543 } else { 544 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 545 } 546 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 547 if (uniform) { 548 KMP_INFORM(Uniform, "KMP_AFFINITY"); 549 } else { 550 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 551 } 552 553 kmp_str_buf_t buf; 554 __kmp_str_buf_init(&buf); 555 556 __kmp_str_buf_print(&buf, "%d", nPackages); 557 //for (level = 1; level <= pkgLevel; level++) { 558 // __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 559 // } 560 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 561 __kmp_nThreadsPerCore, __kmp_ncores); 562 563 __kmp_str_buf_free(&buf); 564 } 565 566 if (__kmp_affinity_type == affinity_none) { 567 __kmp_free(retval); 568 KMP_CPU_FREE(oldMask); 569 return 0; 570 } 571 572 // 573 // Find any levels with radiix 1, and remove them from the map 574 // (except for the package level). 
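    //
    // After the radix-1 levels are removed below, __kmp_affinity_gran_levels
    // is derived from the requested granularity.  For example (hypothetical
    // settings): with a full depth-3 map (package / core / thread) and
    // KMP_AFFINITY granularity=core, only the thread level lies below the
    // granularity, so __kmp_affinity_gran_levels becomes 1; with
    // granularity=fine or granularity=thread it stays 0.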
    //
    depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}
#endif // KMP_USE_HWLOC

//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    if (__kmp_affinity_type == affinity_none) {
        int avail_ct = 0;
        int i;
        KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
            if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
                continue;
            __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
        }
        return 0;
    }

    //
    // Construct the data structure to be returned.
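    //
    // In the flat map each available OS proc becomes its own depth-1
    // address.  For example (hypothetical 4-proc mask {0,1,2,3}), the table
    // built below would be:
    //
    //     address2os = { ({0},0), ({1},1), ({2},2), ({3},3) }
    //
    // i.e. the single label is simply the OS proc id.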
675 // 676 *address2os = (AddrUnsPair*) 677 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 678 int avail_ct = 0; 679 unsigned int i; 680 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 681 // 682 // Skip this proc if it is not included in the machine model. 683 // 684 if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 685 continue; 686 } 687 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat 688 Address addr(1); 689 addr.labels[0] = i; 690 (*address2os)[avail_ct++] = AddrUnsPair(addr,i); 691 } 692 if (__kmp_affinity_verbose) { 693 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); 694 } 695 696 if (__kmp_affinity_gran_levels < 0) { 697 // 698 // Only the package level is modeled in the machine topology map, 699 // so the #levels of granularity is either 0 or 1. 700 // 701 if (__kmp_affinity_gran > affinity_gran_package) { 702 __kmp_affinity_gran_levels = 1; 703 } 704 else { 705 __kmp_affinity_gran_levels = 0; 706 } 707 } 708 return 1; 709 } 710 711 712 # if KMP_GROUP_AFFINITY 713 714 // 715 // If multiple Windows* OS processor groups exist, we can create a 2-level 716 // topology map with the groups at level 0 and the individual procs at 717 // level 1. 718 // 719 // This facilitates letting the threads float among all procs in a group, 720 // if granularity=group (the default when there are multiple groups). 721 // 722 static int 723 __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os, 724 kmp_i18n_id_t *const msg_id) 725 { 726 *address2os = NULL; 727 *msg_id = kmp_i18n_null; 728 729 // 730 // If we don't have multiple processor groups, return now. 731 // The flat mapping will be used. 732 // 733 if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) { 734 // FIXME set *msg_id 735 return -1; 736 } 737 738 // 739 // Contruct the data structure to be returned. 740 // 741 *address2os = (AddrUnsPair*) 742 __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc); 743 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 744 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 745 int avail_ct = 0; 746 int i; 747 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 748 // 749 // Skip this proc if it is not included in the machine model. 750 // 751 if (! 
KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 752 continue; 753 } 754 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat 755 Address addr(2); 756 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); 757 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); 758 (*address2os)[avail_ct++] = AddrUnsPair(addr,i); 759 760 if (__kmp_affinity_verbose) { 761 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], 762 addr.labels[1]); 763 } 764 } 765 766 if (__kmp_affinity_gran_levels < 0) { 767 if (__kmp_affinity_gran == affinity_gran_group) { 768 __kmp_affinity_gran_levels = 1; 769 } 770 else if ((__kmp_affinity_gran == affinity_gran_fine) 771 || (__kmp_affinity_gran == affinity_gran_thread)) { 772 __kmp_affinity_gran_levels = 0; 773 } 774 else { 775 const char *gran_str = NULL; 776 if (__kmp_affinity_gran == affinity_gran_core) { 777 gran_str = "core"; 778 } 779 else if (__kmp_affinity_gran == affinity_gran_package) { 780 gran_str = "package"; 781 } 782 else if (__kmp_affinity_gran == affinity_gran_node) { 783 gran_str = "node"; 784 } 785 else { 786 KMP_ASSERT(0); 787 } 788 789 // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread" 790 __kmp_affinity_gran_levels = 0; 791 } 792 } 793 return 2; 794 } 795 796 # endif /* KMP_GROUP_AFFINITY */ 797 798 799 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 800 801 static int 802 __kmp_cpuid_mask_width(int count) { 803 int r = 0; 804 805 while((1<<r) < count) 806 ++r; 807 return r; 808 } 809 810 811 class apicThreadInfo { 812 public: 813 unsigned osId; // param to __kmp_affinity_bind_thread 814 unsigned apicId; // from cpuid after binding 815 unsigned maxCoresPerPkg; // "" 816 unsigned maxThreadsPerPkg; // "" 817 unsigned pkgId; // inferred from above values 818 unsigned coreId; // "" 819 unsigned threadId; // "" 820 }; 821 822 823 static int 824 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b) 825 { 826 const apicThreadInfo *aa = (const apicThreadInfo *)a; 827 const apicThreadInfo *bb = (const apicThreadInfo *)b; 828 if (aa->osId < bb->osId) return -1; 829 if (aa->osId > bb->osId) return 1; 830 return 0; 831 } 832 833 834 static int 835 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b) 836 { 837 const apicThreadInfo *aa = (const apicThreadInfo *)a; 838 const apicThreadInfo *bb = (const apicThreadInfo *)b; 839 if (aa->pkgId < bb->pkgId) return -1; 840 if (aa->pkgId > bb->pkgId) return 1; 841 if (aa->coreId < bb->coreId) return -1; 842 if (aa->coreId > bb->coreId) return 1; 843 if (aa->threadId < bb->threadId) return -1; 844 if (aa->threadId > bb->threadId) return 1; 845 return 0; 846 } 847 848 849 // 850 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 851 // an algorithm which cycles through the available os threads, setting 852 // the current thread's affinity mask to that thread, and then retrieves 853 // the Apic Id for each thread context using the cpuid instruction. 854 // 855 static int 856 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 857 kmp_i18n_id_t *const msg_id) 858 { 859 kmp_cpuid buf; 860 int rc; 861 *address2os = NULL; 862 *msg_id = kmp_i18n_null; 863 864 // 865 // Check if cpuid leaf 4 is supported. 
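    //
    // cpuid(0) reports the highest supported standard leaf in eax, so leaf n
    // is available iff eax >= n.  For example, a (hypothetical) processor
    // returning eax == 0x0B from cpuid(0) supports leaf 4 (and leaf 11),
    // while one returning eax == 0x01 would fail the check below.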
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
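        //
        // For example (hypothetical, affinity not capable): with
        // __kmp_xproc = 8 and cpuid(4) reporting 4 cores per package, the
        // fallback below derives nPackages = (8 + 4 - 1) / 4 = 2,
        // __kmp_ncores = 8, and __kmp_nThreadsPerCore = 1, i.e. a
        // 2 (pkg) x 4 (core) x 1 (thread) topology is reported.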
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //     value of this field determines the width of the core# + thread#
    //     fields in the Apic Id.  It is also an upper bound on the number
    //     of threads per package, but it has been verified that situations
    //     happen where it is not exact.  In particular, on certain OS/chip
    //     combinations where Intel(R) Hyper-Threading Technology is supported
    //     by the chip but has been disabled, the value of this field will be
    //     2 (for a single core chip).  On other OS/chip combinations
    //     supporting Intel(R) Hyper-Threading Technology, the value of this
    //     field will be 1 when Intel(R) Hyper-Threading Technology is
    //     disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //     value of this field (+1) determines the width of the core# field in
    //     the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //     an upper bound, but the IA-32 architecture manual says that it is
    //     exactly the number of cores per package, and I haven't seen any
    //     case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_dispatch->bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (!
(buf.edx >> 9) & 1) { 1026 __kmp_set_system_affinity(oldMask, TRUE); 1027 __kmp_free(threadInfo); 1028 KMP_CPU_FREE(oldMask); 1029 *msg_id = kmp_i18n_str_ApicNotPresent; 1030 return -1; 1031 } 1032 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 1033 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 1034 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 1035 threadInfo[nApics].maxThreadsPerPkg = 1; 1036 } 1037 1038 // 1039 // Max cores per pkg comes from cpuid(4). 1040 // 1 must be added to the encoded value. 1041 // 1042 // First, we need to check if cpuid(4) is supported on this chip. 1043 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 1044 // has the value n or greater. 1045 // 1046 __kmp_x86_cpuid(0, 0, &buf); 1047 if (buf.eax >= 4) { 1048 __kmp_x86_cpuid(4, 0, &buf); 1049 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 1050 } 1051 else { 1052 threadInfo[nApics].maxCoresPerPkg = 1; 1053 } 1054 1055 // 1056 // Infer the pkgId / coreId / threadId using only the info 1057 // obtained locally. 1058 // 1059 int widthCT = __kmp_cpuid_mask_width( 1060 threadInfo[nApics].maxThreadsPerPkg); 1061 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 1062 1063 int widthC = __kmp_cpuid_mask_width( 1064 threadInfo[nApics].maxCoresPerPkg); 1065 int widthT = widthCT - widthC; 1066 if (widthT < 0) { 1067 // 1068 // I've never seen this one happen, but I suppose it could, if 1069 // the cpuid instruction on a chip was really screwed up. 1070 // Make sure to restore the affinity mask before the tail call. 1071 // 1072 __kmp_set_system_affinity(oldMask, TRUE); 1073 __kmp_free(threadInfo); 1074 KMP_CPU_FREE(oldMask); 1075 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1076 return -1; 1077 } 1078 1079 int maskC = (1 << widthC) - 1; 1080 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) 1081 &maskC; 1082 1083 int maskT = (1 << widthT) - 1; 1084 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT; 1085 1086 nApics++; 1087 } 1088 1089 // 1090 // We've collected all the info we need. 1091 // Restore the old affinity mask for this thread. 1092 // 1093 __kmp_set_system_affinity(oldMask, TRUE); 1094 1095 // 1096 // If there's only one thread context to bind to, form an Address object 1097 // with depth 1 and return immediately (or, if affinity is off, set 1098 // address2os to NULL and return). 1099 // 1100 // If it is configured to omit the package level when there is only a 1101 // single package, the logic at the end of this routine won't work if 1102 // there is only a single thread - it would try to form an Address 1103 // object with depth 0. 
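    //
    // As a worked example of the decomposition above (hypothetical values):
    // with maxThreadsPerPkg = 16 and maxCoresPerPkg = 8, widthCT = 4,
    // widthC = 3 and widthT = 1, so an apicId of 27 (binary 11011) splits
    // into pkgId = 27 >> 4 = 1, coreId = (27 >> 1) & 0x7 = 5, and
    // threadId = 27 & 0x1 = 1.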
1104 // 1105 KMP_ASSERT(nApics > 0); 1106 if (nApics == 1) { 1107 __kmp_ncores = nPackages = 1; 1108 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1109 if (__kmp_affinity_verbose) { 1110 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1111 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1112 1113 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1114 if (__kmp_affinity_respect_mask) { 1115 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1116 } else { 1117 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1118 } 1119 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1120 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1121 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1122 __kmp_nThreadsPerCore, __kmp_ncores); 1123 } 1124 1125 if (__kmp_affinity_type == affinity_none) { 1126 __kmp_free(threadInfo); 1127 KMP_CPU_FREE(oldMask); 1128 return 0; 1129 } 1130 1131 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1132 Address addr(1); 1133 addr.labels[0] = threadInfo[0].pkgId; 1134 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1135 1136 if (__kmp_affinity_gran_levels < 0) { 1137 __kmp_affinity_gran_levels = 0; 1138 } 1139 1140 if (__kmp_affinity_verbose) { 1141 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1142 } 1143 1144 __kmp_free(threadInfo); 1145 KMP_CPU_FREE(oldMask); 1146 return 1; 1147 } 1148 1149 // 1150 // Sort the threadInfo table by physical Id. 1151 // 1152 qsort(threadInfo, nApics, sizeof(*threadInfo), 1153 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1154 1155 // 1156 // The table is now sorted by pkgId / coreId / threadId, but we really 1157 // don't know the radix of any of the fields. pkgId's may be sparsely 1158 // assigned among the chips on a system. Although coreId's are usually 1159 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1160 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1161 // 1162 // For that matter, we don't know what coresPerPkg and threadsPerCore 1163 // (or the total # packages) are at this point - we want to determine 1164 // that now. We only have an upper bound on the first two figures. 1165 // 1166 // We also perform a consistency check at this point: the values returned 1167 // by the cpuid instruction for any thread bound to a given package had 1168 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1169 // 1170 nPackages = 1; 1171 nCoresPerPkg = 1; 1172 __kmp_nThreadsPerCore = 1; 1173 unsigned nCores = 1; 1174 1175 unsigned pkgCt = 1; // to determine radii 1176 unsigned lastPkgId = threadInfo[0].pkgId; 1177 unsigned coreCt = 1; 1178 unsigned lastCoreId = threadInfo[0].coreId; 1179 unsigned threadCt = 1; 1180 unsigned lastThreadId = threadInfo[0].threadId; 1181 1182 // intra-pkg consist checks 1183 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1184 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1185 1186 for (i = 1; i < nApics; i++) { 1187 if (threadInfo[i].pkgId != lastPkgId) { 1188 nCores++; 1189 pkgCt++; 1190 lastPkgId = threadInfo[i].pkgId; 1191 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1192 coreCt = 1; 1193 lastCoreId = threadInfo[i].coreId; 1194 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1195 threadCt = 1; 1196 lastThreadId = threadInfo[i].threadId; 1197 1198 // 1199 // This is a different package, so go on to the next iteration 1200 // without doing any consistency checks. Reset the consistency 1201 // check vars, though. 
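            //
            // For example (hypothetical sorted table): the
            // (pkgId, coreId, threadId) sequence
            // (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1) (1,1,0) (1,1,1)
            // yields pkgCt = 2, nCoresPerPkg = 2, __kmp_nThreadsPerCore = 2
            // and nCores = 4 by the time the loop finishes.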
1202 // 1203 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1204 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1205 continue; 1206 } 1207 1208 if (threadInfo[i].coreId != lastCoreId) { 1209 nCores++; 1210 coreCt++; 1211 lastCoreId = threadInfo[i].coreId; 1212 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1213 threadCt = 1; 1214 lastThreadId = threadInfo[i].threadId; 1215 } 1216 else if (threadInfo[i].threadId != lastThreadId) { 1217 threadCt++; 1218 lastThreadId = threadInfo[i].threadId; 1219 } 1220 else { 1221 __kmp_free(threadInfo); 1222 KMP_CPU_FREE(oldMask); 1223 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1224 return -1; 1225 } 1226 1227 // 1228 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1229 // fields agree between all the threads bounds to a given package. 1230 // 1231 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) 1232 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1233 __kmp_free(threadInfo); 1234 KMP_CPU_FREE(oldMask); 1235 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1236 return -1; 1237 } 1238 } 1239 nPackages = pkgCt; 1240 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1241 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1242 1243 // 1244 // When affinity is off, this routine will still be called to set 1245 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1246 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1247 // correctly, and return now if affinity is not enabled. 1248 // 1249 __kmp_ncores = nCores; 1250 if (__kmp_affinity_verbose) { 1251 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1252 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1253 1254 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1255 if (__kmp_affinity_respect_mask) { 1256 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1257 } else { 1258 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1259 } 1260 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1261 if (__kmp_affinity_uniform_topology()) { 1262 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1263 } else { 1264 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1265 } 1266 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1267 __kmp_nThreadsPerCore, __kmp_ncores); 1268 1269 } 1270 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1271 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1272 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1273 for (i = 0; i < nApics; ++i) { 1274 __kmp_pu_os_idx[i] = threadInfo[i].osId; 1275 } 1276 if (__kmp_affinity_type == affinity_none) { 1277 __kmp_free(threadInfo); 1278 KMP_CPU_FREE(oldMask); 1279 return 0; 1280 } 1281 1282 // 1283 // Now that we've determined the number of packages, the number of cores 1284 // per package, and the number of threads per core, we can construct the 1285 // data structure that is to be returned. 1286 // 1287 int pkgLevel = 0; 1288 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1289 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1290 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1291 1292 KMP_ASSERT(depth > 0); 1293 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1294 1295 for (i = 0; i < nApics; ++i) { 1296 Address addr(depth); 1297 unsigned os = threadInfo[i].osId; 1298 int d = 0; 1299 1300 if (pkgLevel >= 0) { 1301 addr.labels[d++] = threadInfo[i].pkgId; 1302 } 1303 if (coreLevel >= 0) { 1304 addr.labels[d++] = threadInfo[i].coreId; 1305 } 1306 if (threadLevel >= 0) { 1307 addr.labels[d++] = threadInfo[i].threadId; 1308 } 1309 (*address2os)[i] = AddrUnsPair(addr, os); 1310 } 1311 1312 if (__kmp_affinity_gran_levels < 0) { 1313 // 1314 // Set the granularity level based on what levels are modeled 1315 // in the machine topology map. 1316 // 1317 __kmp_affinity_gran_levels = 0; 1318 if ((threadLevel >= 0) 1319 && (__kmp_affinity_gran > affinity_gran_thread)) { 1320 __kmp_affinity_gran_levels++; 1321 } 1322 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1323 __kmp_affinity_gran_levels++; 1324 } 1325 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1326 __kmp_affinity_gran_levels++; 1327 } 1328 } 1329 1330 if (__kmp_affinity_verbose) { 1331 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1332 coreLevel, threadLevel); 1333 } 1334 1335 __kmp_free(threadInfo); 1336 KMP_CPU_FREE(oldMask); 1337 return depth; 1338 } 1339 1340 1341 // 1342 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1343 // architectures support a newer interface for specifying the x2APIC Ids, 1344 // based on cpuid leaf 11. 1345 // 1346 static int 1347 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1348 kmp_i18n_id_t *const msg_id) 1349 { 1350 kmp_cpuid buf; 1351 1352 *address2os = NULL; 1353 *msg_id = kmp_i18n_null; 1354 1355 // 1356 // Check to see if cpuid leaf 11 is supported. 1357 // 1358 __kmp_x86_cpuid(0, 0, &buf); 1359 if (buf.eax < 11) { 1360 *msg_id = kmp_i18n_str_NoLeaf11Support; 1361 return -1; 1362 } 1363 __kmp_x86_cpuid(11, 0, &buf); 1364 if (buf.ebx == 0) { 1365 *msg_id = kmp_i18n_str_NoLeaf11Support; 1366 return -1; 1367 } 1368 1369 // 1370 // Find the number of levels in the machine topology. While we're at it, 1371 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will 1372 // try to get more accurate values later by explicitly counting them, 1373 // but get reasonable defaults now, in case we return early. 1374 // 1375 int level; 1376 int threadLevel = -1; 1377 int coreLevel = -1; 1378 int pkgLevel = -1; 1379 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1380 1381 for (level = 0;; level++) { 1382 if (level > 31) { 1383 // 1384 // FIXME: Hack for DPD200163180 1385 // 1386 // If level is big then something went wrong -> exiting 1387 // 1388 // There could actually be 32 valid levels in the machine topology, 1389 // but so far, the only machine we have seen which does not exit 1390 // this loop before iteration 32 has fubar x2APIC settings. 1391 // 1392 // For now, just reject this case based upon loop trip count. 
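            //
            // For reference, a (hypothetical) dual-core SMT-2 package would
            // typically enumerate as:
            //
            //     cpuid(11, 0): ecx[15:8] = 1 (SMT level),  ebx = 2
            //     cpuid(11, 1): ecx[15:8] = 2 (core level), ebx = 4
            //     cpuid(11, 2): ebx = 0  -> end of enumeration; the package
            //                               level is inferred at this point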
1393 // 1394 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1395 return -1; 1396 } 1397 __kmp_x86_cpuid(11, level, &buf); 1398 if (buf.ebx == 0) { 1399 if (pkgLevel < 0) { 1400 // 1401 // Will infer nPackages from __kmp_xproc 1402 // 1403 pkgLevel = level; 1404 level++; 1405 } 1406 break; 1407 } 1408 int kind = (buf.ecx >> 8) & 0xff; 1409 if (kind == 1) { 1410 // 1411 // SMT level 1412 // 1413 threadLevel = level; 1414 coreLevel = -1; 1415 pkgLevel = -1; 1416 __kmp_nThreadsPerCore = buf.ebx & 0xffff; 1417 if (__kmp_nThreadsPerCore == 0) { 1418 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1419 return -1; 1420 } 1421 } 1422 else if (kind == 2) { 1423 // 1424 // core level 1425 // 1426 coreLevel = level; 1427 pkgLevel = -1; 1428 nCoresPerPkg = buf.ebx & 0xffff; 1429 if (nCoresPerPkg == 0) { 1430 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1431 return -1; 1432 } 1433 } 1434 else { 1435 if (level <= 0) { 1436 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1437 return -1; 1438 } 1439 if (pkgLevel >= 0) { 1440 continue; 1441 } 1442 pkgLevel = level; 1443 nPackages = buf.ebx & 0xffff; 1444 if (nPackages == 0) { 1445 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1446 return -1; 1447 } 1448 } 1449 } 1450 int depth = level; 1451 1452 // 1453 // In the above loop, "level" was counted from the finest level (usually 1454 // thread) to the coarsest. The caller expects that we will place the 1455 // labels in (*address2os)[].first.labels[] in the inverse order, so 1456 // we need to invert the vars saying which level means what. 1457 // 1458 if (threadLevel >= 0) { 1459 threadLevel = depth - threadLevel - 1; 1460 } 1461 if (coreLevel >= 0) { 1462 coreLevel = depth - coreLevel - 1; 1463 } 1464 KMP_DEBUG_ASSERT(pkgLevel >= 0); 1465 pkgLevel = depth - pkgLevel - 1; 1466 1467 // 1468 // The algorithm used starts by setting the affinity to each available 1469 // thread and retrieving info from the cpuid instruction, so if we are 1470 // not capable of calling __kmp_get_system_affinity() and 1471 // _kmp_get_system_affinity(), then we need to do something else - use 1472 // the defaults that we calculated from issuing cpuid without binding 1473 // to each proc. 1474 // 1475 if (! KMP_AFFINITY_CAPABLE()) 1476 { 1477 // 1478 // Hack to try and infer the machine topology using only the data 1479 // available from cpuid on the current thread, and __kmp_xproc. 1480 // 1481 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1482 1483 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1484 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1485 if (__kmp_affinity_verbose) { 1486 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1487 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1488 if (__kmp_affinity_uniform_topology()) { 1489 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1490 } else { 1491 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1492 } 1493 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1494 __kmp_nThreadsPerCore, __kmp_ncores); 1495 } 1496 return 0; 1497 } 1498 1499 // 1500 // 1501 // From here on, we can assume that it is safe to call 1502 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), 1503 // even if __kmp_affinity_type = affinity_none. 1504 // 1505 1506 // 1507 // Save the affinity mask for the current thread. 1508 // 1509 kmp_affin_mask_t *oldMask; 1510 KMP_CPU_ALLOC(oldMask); 1511 __kmp_get_system_affinity(oldMask, TRUE); 1512 1513 // 1514 // Allocate the data structure to be returned. 
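    //
    // Each proc's labels are carved out of its x2APIC id below using the
    // per-level shift widths.  For example (hypothetical values): with shift
    // widths 1 (SMT) and 5 (core) and an x2APIC id of 53 (binary 110101),
    // the extracted labels are thread = 53 & 0x1 = 1,
    // core = (53 & 0x1f) >> 1 = 10, and package = 53 >> 5 = 1.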
1515 // 1516 AddrUnsPair *retval = (AddrUnsPair *) 1517 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1518 1519 // 1520 // Run through each of the available contexts, binding the current thread 1521 // to it, and obtaining the pertinent information using the cpuid instr. 1522 // 1523 unsigned int proc; 1524 int nApics = 0; 1525 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 1526 // 1527 // Skip this proc if it is not included in the machine model. 1528 // 1529 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 1530 continue; 1531 } 1532 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1533 1534 __kmp_affinity_dispatch->bind_thread(proc); 1535 1536 // 1537 // Extrach the labels for each level in the machine topology map 1538 // from the Apic ID. 1539 // 1540 Address addr(depth); 1541 int prev_shift = 0; 1542 1543 for (level = 0; level < depth; level++) { 1544 __kmp_x86_cpuid(11, level, &buf); 1545 unsigned apicId = buf.edx; 1546 if (buf.ebx == 0) { 1547 if (level != depth - 1) { 1548 KMP_CPU_FREE(oldMask); 1549 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1550 return -1; 1551 } 1552 addr.labels[depth - level - 1] = apicId >> prev_shift; 1553 level++; 1554 break; 1555 } 1556 int shift = buf.eax & 0x1f; 1557 int mask = (1 << shift) - 1; 1558 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1559 prev_shift = shift; 1560 } 1561 if (level != depth) { 1562 KMP_CPU_FREE(oldMask); 1563 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1564 return -1; 1565 } 1566 1567 retval[nApics] = AddrUnsPair(addr, proc); 1568 nApics++; 1569 } 1570 1571 // 1572 // We've collected all the info we need. 1573 // Restore the old affinity mask for this thread. 1574 // 1575 __kmp_set_system_affinity(oldMask, TRUE); 1576 1577 // 1578 // If there's only one thread context to bind to, return now. 1579 // 1580 KMP_ASSERT(nApics > 0); 1581 if (nApics == 1) { 1582 __kmp_ncores = nPackages = 1; 1583 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1584 if (__kmp_affinity_verbose) { 1585 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1586 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1587 1588 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1589 if (__kmp_affinity_respect_mask) { 1590 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1591 } else { 1592 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1593 } 1594 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1595 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1596 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1597 __kmp_nThreadsPerCore, __kmp_ncores); 1598 } 1599 1600 if (__kmp_affinity_type == affinity_none) { 1601 __kmp_free(retval); 1602 KMP_CPU_FREE(oldMask); 1603 return 0; 1604 } 1605 1606 // 1607 // Form an Address object which only includes the package level. 1608 // 1609 Address addr(1); 1610 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1611 retval[0].first = addr; 1612 1613 if (__kmp_affinity_gran_levels < 0) { 1614 __kmp_affinity_gran_levels = 0; 1615 } 1616 1617 if (__kmp_affinity_verbose) { 1618 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1619 } 1620 1621 *address2os = retval; 1622 KMP_CPU_FREE(oldMask); 1623 return 1; 1624 } 1625 1626 // 1627 // Sort the table by physical Id. 1628 // 1629 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1630 1631 // 1632 // Find the radix at each of the levels. 
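    //
    // For example (hypothetical sorted table): two packages with 4 and 2
    // cores respectively give totals[coreLevel] = 6 (which becomes
    // __kmp_ncores) while maxCt[coreLevel] = 4 (which becomes nCoresPerPkg);
    // see also the note in the loop below about why maxCt[] is not reset
    // when an outer level changes.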
1633 // 1634 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1635 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1636 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1637 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1638 for (level = 0; level < depth; level++) { 1639 totals[level] = 1; 1640 maxCt[level] = 1; 1641 counts[level] = 1; 1642 last[level] = retval[0].first.labels[level]; 1643 } 1644 1645 // 1646 // From here on, the iteration variable "level" runs from the finest 1647 // level to the coarsest, i.e. we iterate forward through 1648 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1649 // backwards. 1650 // 1651 for (proc = 1; (int)proc < nApics; proc++) { 1652 int level; 1653 for (level = 0; level < depth; level++) { 1654 if (retval[proc].first.labels[level] != last[level]) { 1655 int j; 1656 for (j = level + 1; j < depth; j++) { 1657 totals[j]++; 1658 counts[j] = 1; 1659 // The line below causes printing incorrect topology information 1660 // in case the max value for some level (maxCt[level]) is encountered earlier than 1661 // some less value while going through the array. 1662 // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2 1663 // whereas it must be 4. 1664 // TODO!!! Check if it can be commented safely 1665 //maxCt[j] = 1; 1666 last[j] = retval[proc].first.labels[j]; 1667 } 1668 totals[level]++; 1669 counts[level]++; 1670 if (counts[level] > maxCt[level]) { 1671 maxCt[level] = counts[level]; 1672 } 1673 last[level] = retval[proc].first.labels[level]; 1674 break; 1675 } 1676 else if (level == depth - 1) { 1677 __kmp_free(last); 1678 __kmp_free(maxCt); 1679 __kmp_free(counts); 1680 __kmp_free(totals); 1681 __kmp_free(retval); 1682 KMP_CPU_FREE(oldMask); 1683 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1684 return -1; 1685 } 1686 } 1687 } 1688 1689 // 1690 // When affinity is off, this routine will still be called to set 1691 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 1692 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1693 // correctly, and return if affinity is not enabled. 1694 // 1695 if (threadLevel >= 0) { 1696 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1697 } 1698 else { 1699 __kmp_nThreadsPerCore = 1; 1700 } 1701 nPackages = totals[pkgLevel]; 1702 1703 if (coreLevel >= 0) { 1704 __kmp_ncores = totals[coreLevel]; 1705 nCoresPerPkg = maxCt[coreLevel]; 1706 } 1707 else { 1708 __kmp_ncores = nPackages; 1709 nCoresPerPkg = 1; 1710 } 1711 1712 // 1713 // Check to see if the machine topology is uniform 1714 // 1715 unsigned prod = maxCt[0]; 1716 for (level = 1; level < depth; level++) { 1717 prod *= maxCt[level]; 1718 } 1719 bool uniform = (prod == totals[level - 1]); 1720 1721 // 1722 // Print the machine topology summary. 
1723 // 1724 if (__kmp_affinity_verbose) { 1725 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1726 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1727 1728 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1729 if (__kmp_affinity_respect_mask) { 1730 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1731 } else { 1732 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1733 } 1734 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1735 if (uniform) { 1736 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1737 } else { 1738 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1739 } 1740 1741 kmp_str_buf_t buf; 1742 __kmp_str_buf_init(&buf); 1743 1744 __kmp_str_buf_print(&buf, "%d", totals[0]); 1745 for (level = 1; level <= pkgLevel; level++) { 1746 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1747 } 1748 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1749 __kmp_nThreadsPerCore, __kmp_ncores); 1750 1751 __kmp_str_buf_free(&buf); 1752 } 1753 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1754 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1755 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1756 for (proc = 0; (int)proc < nApics; ++proc) { 1757 __kmp_pu_os_idx[proc] = retval[proc].second; 1758 } 1759 if (__kmp_affinity_type == affinity_none) { 1760 __kmp_free(last); 1761 __kmp_free(maxCt); 1762 __kmp_free(counts); 1763 __kmp_free(totals); 1764 __kmp_free(retval); 1765 KMP_CPU_FREE(oldMask); 1766 return 0; 1767 } 1768 1769 // 1770 // Find any levels with radiix 1, and remove them from the map 1771 // (except for the package level). 1772 // 1773 int new_depth = 0; 1774 for (level = 0; level < depth; level++) { 1775 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1776 continue; 1777 } 1778 new_depth++; 1779 } 1780 1781 // 1782 // If we are removing any levels, allocate a new vector to return, 1783 // and copy the relevant information to it. 1784 // 1785 if (new_depth != depth) { 1786 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1787 sizeof(AddrUnsPair) * nApics); 1788 for (proc = 0; (int)proc < nApics; proc++) { 1789 Address addr(new_depth); 1790 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1791 } 1792 int new_level = 0; 1793 int newPkgLevel = -1; 1794 int newCoreLevel = -1; 1795 int newThreadLevel = -1; 1796 int i; 1797 for (level = 0; level < depth; level++) { 1798 if ((maxCt[level] == 1) 1799 && (level != pkgLevel)) { 1800 // 1801 // Remove this level. Never remove the package level 1802 // 1803 continue; 1804 } 1805 if (level == pkgLevel) { 1806 newPkgLevel = level; 1807 } 1808 if (level == coreLevel) { 1809 newCoreLevel = level; 1810 } 1811 if (level == threadLevel) { 1812 newThreadLevel = level; 1813 } 1814 for (proc = 0; (int)proc < nApics; proc++) { 1815 new_retval[proc].first.labels[new_level] 1816 = retval[proc].first.labels[level]; 1817 } 1818 new_level++; 1819 } 1820 1821 __kmp_free(retval); 1822 retval = new_retval; 1823 depth = new_depth; 1824 pkgLevel = newPkgLevel; 1825 coreLevel = newCoreLevel; 1826 threadLevel = newThreadLevel; 1827 } 1828 1829 if (__kmp_affinity_gran_levels < 0) { 1830 // 1831 // Set the granularity level based on what levels are modeled 1832 // in the machine topology map. 
1833 // 1834 __kmp_affinity_gran_levels = 0; 1835 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1836 __kmp_affinity_gran_levels++; 1837 } 1838 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1839 __kmp_affinity_gran_levels++; 1840 } 1841 if (__kmp_affinity_gran > affinity_gran_package) { 1842 __kmp_affinity_gran_levels++; 1843 } 1844 } 1845 1846 if (__kmp_affinity_verbose) { 1847 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1848 coreLevel, threadLevel); 1849 } 1850 1851 __kmp_free(last); 1852 __kmp_free(maxCt); 1853 __kmp_free(counts); 1854 __kmp_free(totals); 1855 KMP_CPU_FREE(oldMask); 1856 *address2os = retval; 1857 return depth; 1858 } 1859 1860 1861 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1862 1863 1864 #define osIdIndex 0 1865 #define threadIdIndex 1 1866 #define coreIdIndex 2 1867 #define pkgIdIndex 3 1868 #define nodeIdIndex 4 1869 1870 typedef unsigned *ProcCpuInfo; 1871 static unsigned maxIndex = pkgIdIndex; 1872 1873 1874 static int 1875 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1876 { 1877 const unsigned *aa = (const unsigned *)a; 1878 const unsigned *bb = (const unsigned *)b; 1879 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1880 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1881 return 0; 1882 }; 1883 1884 1885 static int 1886 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1887 { 1888 unsigned i; 1889 const unsigned *aa = *((const unsigned **)a); 1890 const unsigned *bb = *((const unsigned **)b); 1891 for (i = maxIndex; ; i--) { 1892 if (aa[i] < bb[i]) return -1; 1893 if (aa[i] > bb[i]) return 1; 1894 if (i == osIdIndex) break; 1895 } 1896 return 0; 1897 } 1898 1899 1900 // 1901 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1902 // affinity map. 1903 // 1904 static int 1905 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1906 kmp_i18n_id_t *const msg_id, FILE *f) 1907 { 1908 *address2os = NULL; 1909 *msg_id = kmp_i18n_null; 1910 1911 // 1912 // Scan the file, and count the number of "processor" (osId) fields, 1913 // and find the highest value of <n> for a node_<n> field. 1914 // 1915 char buf[256]; 1916 unsigned num_records = 0; 1917 while (! feof(f)) { 1918 buf[sizeof(buf) - 1] = 1; 1919 if (! fgets(buf, sizeof(buf), f)) { 1920 // 1921 // Read errors presumably because of EOF 1922 // 1923 break; 1924 } 1925 1926 char s1[] = "processor"; 1927 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1928 num_records++; 1929 continue; 1930 } 1931 1932 // 1933 // FIXME - this will match "node_<n> <garbage>" 1934 // 1935 unsigned level; 1936 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1937 if (nodeIdIndex + level >= maxIndex) { 1938 maxIndex = nodeIdIndex + level; 1939 } 1940 continue; 1941 } 1942 } 1943 1944 // 1945 // Check for empty file / no valid processor records, or too many. 1946 // The number of records can't exceed the number of valid bits in the 1947 // affinity mask. 1948 // 1949 if (num_records == 0) { 1950 *line = 0; 1951 *msg_id = kmp_i18n_str_NoProcRecords; 1952 return -1; 1953 } 1954 if (num_records > (unsigned)__kmp_xproc) { 1955 *line = 0; 1956 *msg_id = kmp_i18n_str_TooManyProcRecords; 1957 return -1; 1958 } 1959 1960 // 1961 // Set the file pointer back to the beginning, so that we can scan the 1962 // file again, this time performing a full parse of the data. 1963 // Allocate a vector of ProcCpuInfo objects, where we will place the data.
1964 // Adding an extra element at the end allows us to remove a lot of extra 1965 // checks for termination conditions. 1966 // 1967 if (fseek(f, 0, SEEK_SET) != 0) { 1968 *line = 0; 1969 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1970 return -1; 1971 } 1972 1973 // 1974 // Allocate the array of records to store the proc info in. The dummy 1975 // element at the end makes the logic in filling them out easier to code. 1976 // 1977 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1978 * sizeof(unsigned *)); 1979 unsigned i; 1980 for (i = 0; i <= num_records; i++) { 1981 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1982 * sizeof(unsigned)); 1983 } 1984 1985 #define CLEANUP_THREAD_INFO \ 1986 for (i = 0; i <= num_records; i++) { \ 1987 __kmp_free(threadInfo[i]); \ 1988 } \ 1989 __kmp_free(threadInfo); 1990 1991 // 1992 // A value of UINT_MAX means that we didn't find the field 1993 // 1994 unsigned __index; 1995 1996 #define INIT_PROC_INFO(p) \ 1997 for (__index = 0; __index <= maxIndex; __index++) { \ 1998 (p)[__index] = UINT_MAX; \ 1999 } 2000 2001 for (i = 0; i <= num_records; i++) { 2002 INIT_PROC_INFO(threadInfo[i]); 2003 } 2004 2005 unsigned num_avail = 0; 2006 *line = 0; 2007 while (! feof(f)) { 2008 // 2009 // Create an inner scoping level, so that all the goto targets at the 2010 // end of the loop appear in an outer scoping level. This avoids 2011 // warnings about jumping past an initialization to a target in the 2012 // same block. 2013 // 2014 { 2015 buf[sizeof(buf) - 1] = 1; 2016 bool long_line = false; 2017 if (! fgets(buf, sizeof(buf), f)) { 2018 // 2019 // Read errors presumably because of EOF 2020 // 2021 // If there is valid data in threadInfo[num_avail], then fake 2022 // a blank line to ensure that the last address gets parsed. 2023 // 2024 bool valid = false; 2025 for (i = 0; i <= maxIndex; i++) { 2026 if (threadInfo[num_avail][i] != UINT_MAX) { 2027 valid = true; 2028 } 2029 } 2030 if (! valid) { 2031 break; 2032 } 2033 buf[0] = 0; 2034 } else if (!buf[sizeof(buf) - 1]) { 2035 // 2036 // The line is longer than the buffer. Set a flag and don't 2037 // emit an error if we were going to ignore the line, anyway.
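// (The check above works because fgets() always NUL-terminates what it
// reads: if the line filled the entire buffer, the sentinel 1 stored in
// buf[sizeof(buf) - 1] before the read has been overwritten with '\0'.)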
2038 // 2039 long_line = true; 2040 2041 #define CHECK_LINE \ 2042 if (long_line) { \ 2043 CLEANUP_THREAD_INFO; \ 2044 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2045 return -1; \ 2046 } 2047 } 2048 (*line)++; 2049 2050 char s1[] = "processor"; 2051 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2052 CHECK_LINE; 2053 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2054 unsigned val; 2055 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2056 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 2057 threadInfo[num_avail][osIdIndex] = val; 2058 #if KMP_OS_LINUX && USE_SYSFS_INFO 2059 char path[256]; 2060 KMP_SNPRINTF(path, sizeof(path), 2061 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2062 threadInfo[num_avail][osIdIndex]); 2063 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2064 2065 KMP_SNPRINTF(path, sizeof(path), 2066 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2067 threadInfo[num_avail][osIdIndex]); 2068 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2069 continue; 2070 #else 2071 } 2072 char s2[] = "physical id"; 2073 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2074 CHECK_LINE; 2075 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2076 unsigned val; 2077 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2078 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2079 threadInfo[num_avail][pkgIdIndex] = val; 2080 continue; 2081 } 2082 char s3[] = "core id"; 2083 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2084 CHECK_LINE; 2085 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2086 unsigned val; 2087 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2088 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2089 threadInfo[num_avail][coreIdIndex] = val; 2090 continue; 2091 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2092 } 2093 char s4[] = "thread id"; 2094 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2095 CHECK_LINE; 2096 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2097 unsigned val; 2098 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2099 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2100 threadInfo[num_avail][threadIdIndex] = val; 2101 continue; 2102 } 2103 unsigned level; 2104 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2105 CHECK_LINE; 2106 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2107 unsigned val; 2108 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2109 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2110 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2111 threadInfo[num_avail][nodeIdIndex + level] = val; 2112 continue; 2113 } 2114 2115 // 2116 // We didn't recognize the leading token on the line. 2117 // There are lots of leading tokens that we don't recognize - 2118 // if the line isn't empty, go on to the next line. 2119 // 2120 if ((*buf != 0) && (*buf != '\n')) { 2121 // 2122 // If the line is longer than the buffer, read characters 2123 // until we find a newline. 2124 // 2125 if (long_line) { 2126 int ch; 2127 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2128 } 2129 continue; 2130 } 2131 2132 // 2133 // A newline has signalled the end of the processor record. 2134 // Check that there aren't too many procs specified. 2135 // 2136 if ((int)num_avail == __kmp_xproc) { 2137 CLEANUP_THREAD_INFO; 2138 *msg_id = kmp_i18n_str_TooManyEntries; 2139 return -1; 2140 } 2141 2142 // 2143 // Check for missing fields. 
The osId field must be there, and we 2144 // currently require that the physical id field is specified, also. 2145 // 2146 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2147 CLEANUP_THREAD_INFO; 2148 *msg_id = kmp_i18n_str_MissingProcField; 2149 return -1; 2150 } 2151 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2152 CLEANUP_THREAD_INFO; 2153 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2154 return -1; 2155 } 2156 2157 // 2158 // Skip this proc if it is not included in the machine model. 2159 // 2160 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) { 2161 INIT_PROC_INFO(threadInfo[num_avail]); 2162 continue; 2163 } 2164 2165 // 2166 // We have a successful parse of this proc's info. 2167 // Increment the counter, and prepare for the next proc. 2168 // 2169 num_avail++; 2170 KMP_ASSERT(num_avail <= num_records); 2171 INIT_PROC_INFO(threadInfo[num_avail]); 2172 } 2173 continue; 2174 2175 no_val: 2176 CLEANUP_THREAD_INFO; 2177 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2178 return -1; 2179 2180 dup_field: 2181 CLEANUP_THREAD_INFO; 2182 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2183 return -1; 2184 } 2185 *line = 0; 2186 2187 # if KMP_MIC && REDUCE_TEAM_SIZE 2188 unsigned teamSize = 0; 2189 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2190 2191 // check for num_records == __kmp_xproc ??? 2192 2193 // 2194 // If there's only one thread context to bind to, form an Address object 2195 // with depth 1 and return immediately (or, if affinity is off, set 2196 // address2os to NULL and return). 2197 // 2198 // If it is configured to omit the package level when there is only a 2199 // single package, the logic at the end of this routine won't work if 2200 // there is only a single thread - it would try to form an Address 2201 // object with depth 0. 2202 // 2203 KMP_ASSERT(num_avail > 0); 2204 KMP_ASSERT(num_avail <= num_records); 2205 if (num_avail == 1) { 2206 __kmp_ncores = 1; 2207 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2208 if (__kmp_affinity_verbose) { 2209 if (! 
KMP_AFFINITY_CAPABLE()) { 2210 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2211 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2212 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2213 } 2214 else { 2215 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2216 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2217 __kmp_affin_fullMask); 2218 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2219 if (__kmp_affinity_respect_mask) { 2220 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2221 } else { 2222 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2223 } 2224 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2225 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2226 } 2227 int index; 2228 kmp_str_buf_t buf; 2229 __kmp_str_buf_init(&buf); 2230 __kmp_str_buf_print(&buf, "1"); 2231 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2232 __kmp_str_buf_print(&buf, " x 1"); 2233 } 2234 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2235 __kmp_str_buf_free(&buf); 2236 } 2237 2238 if (__kmp_affinity_type == affinity_none) { 2239 CLEANUP_THREAD_INFO; 2240 return 0; 2241 } 2242 2243 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2244 Address addr(1); 2245 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2246 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2247 2248 if (__kmp_affinity_gran_levels < 0) { 2249 __kmp_affinity_gran_levels = 0; 2250 } 2251 2252 if (__kmp_affinity_verbose) { 2253 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2254 } 2255 2256 CLEANUP_THREAD_INFO; 2257 return 1; 2258 } 2259 2260 // 2261 // Sort the threadInfo table by physical Id. 2262 // 2263 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2264 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2265 2266 // 2267 // The table is now sorted by pkgId / coreId / threadId, but we really 2268 // don't know the radix of any of the fields. pkgId's may be sparsely 2269 // assigned among the chips on a system. Although coreId's are usually 2270 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2271 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2272 // 2273 // For that matter, we don't know what coresPerPkg and threadsPerCore 2274 // (or the total # packages) are at this point - we want to determine 2275 // that now. We only have an upper bound on the first two figures. 2276 // 2277 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2278 * sizeof(unsigned)); 2279 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2280 * sizeof(unsigned)); 2281 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2282 * sizeof(unsigned)); 2283 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2284 * sizeof(unsigned)); 2285 2286 bool assign_thread_ids = false; 2287 unsigned threadIdCt; 2288 unsigned index; 2289 2290 restart_radix_check: 2291 threadIdCt = 0; 2292 2293 // 2294 // Initialize the counter arrays with data from threadInfo[0]. 2295 // 2296 if (assign_thread_ids) { 2297 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2298 threadInfo[0][threadIdIndex] = threadIdCt++; 2299 } 2300 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2301 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2302 } 2303 } 2304 for (index = 0; index <= maxIndex; index++) { 2305 counts[index] = 1; 2306 maxCt[index] = 1; 2307 totals[index] = 1; 2308 lastId[index] = threadInfo[0][index];; 2309 } 2310 2311 // 2312 // Run through the rest of the OS procs. 
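// For example, if /proc/cpuinfo omits the "thread id" field entirely, two
// records on the same core are identical at every index from threadIdIndex
// up, so the scan below falls through with index < threadIdIndex,
// assign_thread_ids is set, and the pass restarts at restart_radix_check
// to number the threads on each core itself.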
2313 // 2314 for (i = 1; i < num_avail; i++) { 2315 // 2316 // Find the most significant index whose id differs 2317 // from the id for the previous OS proc. 2318 // 2319 for (index = maxIndex; index >= threadIdIndex; index--) { 2320 if (assign_thread_ids && (index == threadIdIndex)) { 2321 // 2322 // Auto-assign the thread id field if it wasn't specified. 2323 // 2324 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2325 threadInfo[i][threadIdIndex] = threadIdCt++; 2326 } 2327 2328 // 2329 // Apparently the thread id field was specified for some 2330 // entries and not others. Start the thread id counter 2331 // off at the next higher thread id. 2332 // 2333 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2334 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2335 } 2336 } 2337 if (threadInfo[i][index] != lastId[index]) { 2338 // 2339 // Run through all indices which are less significant, 2340 // and reset the counts to 1. 2341 // 2342 // At all levels up to and including index, we need to 2343 // increment the totals and record the last id. 2344 // 2345 unsigned index2; 2346 for (index2 = threadIdIndex; index2 < index; index2++) { 2347 totals[index2]++; 2348 if (counts[index2] > maxCt[index2]) { 2349 maxCt[index2] = counts[index2]; 2350 } 2351 counts[index2] = 1; 2352 lastId[index2] = threadInfo[i][index2]; 2353 } 2354 counts[index]++; 2355 totals[index]++; 2356 lastId[index] = threadInfo[i][index]; 2357 2358 if (assign_thread_ids && (index > threadIdIndex)) { 2359 2360 # if KMP_MIC && REDUCE_TEAM_SIZE 2361 // 2362 // The default team size is the total #threads in the machine 2363 // minus 1 thread for every core that has 3 or more threads. 2364 // 2365 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2366 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2367 2368 // 2369 // Restart the thread counter, as we are on a new core. 2370 // 2371 threadIdCt = 0; 2372 2373 // 2374 // Auto-assign the thread id field if it wasn't specified. 2375 // 2376 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2377 threadInfo[i][threadIdIndex] = threadIdCt++; 2378 } 2379 2380 // 2381 // Apparently the thread id field was specified for some 2382 // entries and not others. Start the thread id counter 2383 // off at the next higher thread id. 2384 // 2385 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2386 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2387 } 2388 } 2389 break; 2390 } 2391 } 2392 if (index < threadIdIndex) { 2393 // 2394 // If thread ids were specified, it is an error if they are not 2395 // unique. Also, check that we haven't already restarted the 2396 // loop (to be safe - shouldn't need to). 2397 // 2398 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2399 || assign_thread_ids) { 2400 __kmp_free(lastId); 2401 __kmp_free(totals); 2402 __kmp_free(maxCt); 2403 __kmp_free(counts); 2404 CLEANUP_THREAD_INFO; 2405 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2406 return -1; 2407 } 2408 2409 // 2410 // If the thread ids were not specified and we see 2411 // entries that are duplicates, start the loop over and 2412 // assign the thread ids manually. 2413 // 2414 assign_thread_ids = true; 2415 goto restart_radix_check; 2416 } 2417 } 2418 2419 # if KMP_MIC && REDUCE_TEAM_SIZE 2420 // 2421 // The default team size is the total #threads in the machine 2422 // minus 1 thread for every core that has 3 or more threads. 2423 // 2424 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2425 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2426 2427 for (index = threadIdIndex; index <= maxIndex; index++) { 2428 if (counts[index] > maxCt[index]) { 2429 maxCt[index] = counts[index]; 2430 } 2431 } 2432 2433 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2434 nCoresPerPkg = maxCt[coreIdIndex]; 2435 nPackages = totals[pkgIdIndex]; 2436 2437 // 2438 // Check to see if the machine topology is uniform 2439 // 2440 unsigned prod = totals[maxIndex]; 2441 for (index = threadIdIndex; index < maxIndex; index++) { 2442 prod *= maxCt[index]; 2443 } 2444 bool uniform = (prod == totals[threadIdIndex]); 2445 2446 // 2447 // When affinity is off, this routine will still be called to set 2448 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2449 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2450 // correctly, and return now if affinity is not enabled. 2451 // 2452 __kmp_ncores = totals[coreIdIndex]; 2453 2454 if (__kmp_affinity_verbose) { 2455 if (! KMP_AFFINITY_CAPABLE()) { 2456 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2457 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2458 if (uniform) { 2459 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2460 } else { 2461 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2462 } 2463 } 2464 else { 2465 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2466 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); 2467 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2468 if (__kmp_affinity_respect_mask) { 2469 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2470 } else { 2471 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2472 } 2473 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2474 if (uniform) { 2475 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2476 } else { 2477 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2478 } 2479 } 2480 kmp_str_buf_t buf; 2481 __kmp_str_buf_init(&buf); 2482 2483 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2484 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2485 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2486 } 2487 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2488 maxCt[threadIdIndex], __kmp_ncores); 2489 2490 __kmp_str_buf_free(&buf); 2491 } 2492 2493 # if KMP_MIC && REDUCE_TEAM_SIZE 2494 // 2495 // Set the default team size. 2496 // 2497 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2498 __kmp_dflt_team_nth = teamSize; 2499 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2500 __kmp_dflt_team_nth)); 2501 } 2502 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2503 2504 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2505 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2506 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2507 for (i = 0; i < num_avail; ++i) { // fill the os indices 2508 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2509 } 2510 2511 if (__kmp_affinity_type == affinity_none) { 2512 __kmp_free(lastId); 2513 __kmp_free(totals); 2514 __kmp_free(maxCt); 2515 __kmp_free(counts); 2516 CLEANUP_THREAD_INFO; 2517 return 0; 2518 } 2519 2520 // 2521 // Count the number of levels which have more nodes at that level than 2522 // at the parent's level (with there being an implicit root node of 2523 // the top level). This is equivalent to saying that there is at least 2524 // one node at this level which has a sibling. These levels are in the 2525 // map, and the package level is always in the map. 
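// For example, on a hypothetical single-package machine with 4 cores and
// one thread per core (and no node_<n> fields), totals[threadIdIndex] ==
// totals[coreIdIndex] == 4 and totals[pkgIdIndex] == 1, so the thread level
// is dropped, the core level is kept, and the package level is forced in
// below, giving depth == 2.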
2526 // 2527 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2528 int level = 0; 2529 for (index = threadIdIndex; index < maxIndex; index++) { 2530 KMP_ASSERT(totals[index] >= totals[index + 1]); 2531 inMap[index] = (totals[index] > totals[index + 1]); 2532 } 2533 inMap[maxIndex] = (totals[maxIndex] > 1); 2534 inMap[pkgIdIndex] = true; 2535 2536 int depth = 0; 2537 for (index = threadIdIndex; index <= maxIndex; index++) { 2538 if (inMap[index]) { 2539 depth++; 2540 } 2541 } 2542 KMP_ASSERT(depth > 0); 2543 2544 // 2545 // Construct the data structure that is to be returned. 2546 // 2547 *address2os = (AddrUnsPair*) 2548 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2549 int pkgLevel = -1; 2550 int coreLevel = -1; 2551 int threadLevel = -1; 2552 2553 for (i = 0; i < num_avail; ++i) { 2554 Address addr(depth); 2555 unsigned os = threadInfo[i][osIdIndex]; 2556 int src_index; 2557 int dst_index = 0; 2558 2559 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2560 if (! inMap[src_index]) { 2561 continue; 2562 } 2563 addr.labels[dst_index] = threadInfo[i][src_index]; 2564 if (src_index == pkgIdIndex) { 2565 pkgLevel = dst_index; 2566 } 2567 else if (src_index == coreIdIndex) { 2568 coreLevel = dst_index; 2569 } 2570 else if (src_index == threadIdIndex) { 2571 threadLevel = dst_index; 2572 } 2573 dst_index++; 2574 } 2575 (*address2os)[i] = AddrUnsPair(addr, os); 2576 } 2577 2578 if (__kmp_affinity_gran_levels < 0) { 2579 // 2580 // Set the granularity level based on what levels are modeled 2581 // in the machine topology map. 2582 // 2583 unsigned src_index; 2584 __kmp_affinity_gran_levels = 0; 2585 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2586 if (! inMap[src_index]) { 2587 continue; 2588 } 2589 switch (src_index) { 2590 case threadIdIndex: 2591 if (__kmp_affinity_gran > affinity_gran_thread) { 2592 __kmp_affinity_gran_levels++; 2593 } 2594 2595 break; 2596 case coreIdIndex: 2597 if (__kmp_affinity_gran > affinity_gran_core) { 2598 __kmp_affinity_gran_levels++; 2599 } 2600 break; 2601 2602 case pkgIdIndex: 2603 if (__kmp_affinity_gran > affinity_gran_package) { 2604 __kmp_affinity_gran_levels++; 2605 } 2606 break; 2607 } 2608 } 2609 } 2610 2611 if (__kmp_affinity_verbose) { 2612 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2613 coreLevel, threadLevel); 2614 } 2615 2616 __kmp_free(inMap); 2617 __kmp_free(lastId); 2618 __kmp_free(totals); 2619 __kmp_free(maxCt); 2620 __kmp_free(counts); 2621 CLEANUP_THREAD_INFO; 2622 return depth; 2623 } 2624 2625 2626 // 2627 // Create and return a table of affinity masks, indexed by OS thread ID. 2628 // This routine handles OR'ing together all the affinity masks of threads 2629 // that are sufficiently close, if granularity > fine. 2630 // 2631 static kmp_affin_mask_t * 2632 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2633 AddrUnsPair *address2os, unsigned numAddrs) 2634 { 2635 // 2636 // First form a table of affinity masks in order of OS thread id. 2637 // 2638 unsigned depth; 2639 unsigned maxOsId; 2640 unsigned i; 2641 2642 KMP_ASSERT(numAddrs > 0); 2643 depth = address2os[0].first.depth; 2644 2645 maxOsId = 0; 2646 for (i = 0; i < numAddrs; i++) { 2647 unsigned osId = address2os[i].second; 2648 if (osId > maxOsId) { 2649 maxOsId = osId; 2650 } 2651 } 2652 kmp_affin_mask_t *osId2Mask; 2653 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1)); 2654 2655 // 2656 // Sort the address2os table according to physical order. 
Doing so 2657 // will put all threads on the same core/package/node in consecutive 2658 // locations. 2659 // 2660 qsort(address2os, numAddrs, sizeof(*address2os), 2661 __kmp_affinity_cmp_Address_labels); 2662 2663 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2664 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2665 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2666 } 2667 if (__kmp_affinity_gran_levels >= (int)depth) { 2668 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2669 && (__kmp_affinity_type != affinity_none))) { 2670 KMP_WARNING(AffThreadsMayMigrate); 2671 } 2672 } 2673 2674 // 2675 // Run through the table, forming the masks for all threads on each 2676 // core. Threads on the same core will have identical "Address" 2677 // objects, not considering the last level, which must be the thread 2678 // id. All threads on a core will appear consecutively. 2679 // 2680 unsigned unique = 0; 2681 unsigned j = 0; // index of 1st thread on core 2682 unsigned leader = 0; 2683 Address *leaderAddr = &(address2os[0].first); 2684 kmp_affin_mask_t *sum; 2685 KMP_CPU_ALLOC_ON_STACK(sum); 2686 KMP_CPU_ZERO(sum); 2687 KMP_CPU_SET(address2os[0].second, sum); 2688 for (i = 1; i < numAddrs; i++) { 2689 // 2690 // If this thread is sufficiently close to the leader (within the 2691 // granularity setting), then set the bit for this os thread in the 2692 // affinity mask for this group, and go on to the next thread. 2693 // 2694 if (leaderAddr->isClose(address2os[i].first, 2695 __kmp_affinity_gran_levels)) { 2696 KMP_CPU_SET(address2os[i].second, sum); 2697 continue; 2698 } 2699 2700 // 2701 // For every thread in this group, copy the mask to the thread's 2702 // entry in the osId2Mask table. Mark the first address as a 2703 // leader. 2704 // 2705 for (; j < i; j++) { 2706 unsigned osId = address2os[j].second; 2707 KMP_DEBUG_ASSERT(osId <= maxOsId); 2708 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2709 KMP_CPU_COPY(mask, sum); 2710 address2os[j].first.leader = (j == leader); 2711 } 2712 unique++; 2713 2714 // 2715 // Start a new mask. 2716 // 2717 leader = i; 2718 leaderAddr = &(address2os[i].first); 2719 KMP_CPU_ZERO(sum); 2720 KMP_CPU_SET(address2os[i].second, sum); 2721 } 2722 2723 // 2724 // For every thread in last group, copy the mask to the thread's 2725 // entry in the osId2Mask table. 2726 // 2727 for (; j < i; j++) { 2728 unsigned osId = address2os[j].second; 2729 KMP_DEBUG_ASSERT(osId <= maxOsId); 2730 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2731 KMP_CPU_COPY(mask, sum); 2732 address2os[j].first.leader = (j == leader); 2733 } 2734 unique++; 2735 KMP_CPU_FREE_FROM_STACK(sum); 2736 2737 *maxIndex = maxOsId; 2738 *numUnique = unique; 2739 return osId2Mask; 2740 } 2741 2742 2743 // 2744 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2745 // as file-static than to try and pass them through the calling sequence of 2746 // the recursive-descent OMP_PLACES parser. 
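// newMasks starts small (numNewMasks == 2 in the parsers below) and the
// ADD_MASK macro doubles the array whenever nextNewMask catches up with it,
// copying the existing masks into the larger allocation before freeing the
// old one.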
2747 // 2748 static kmp_affin_mask_t *newMasks; 2749 static int numNewMasks; 2750 static int nextNewMask; 2751 2752 #define ADD_MASK(_mask) \ 2753 { \ 2754 if (nextNewMask >= numNewMasks) { \ 2755 int i; \ 2756 numNewMasks *= 2; \ 2757 kmp_affin_mask_t* temp; \ 2758 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2759 for(i=0;i<numNewMasks/2;i++) { \ 2760 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \ 2761 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \ 2762 KMP_CPU_COPY(dest, src); \ 2763 } \ 2764 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \ 2765 newMasks = temp; \ 2766 } \ 2767 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2768 nextNewMask++; \ 2769 } 2770 2771 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2772 { \ 2773 if (((_osId) > _maxOsId) || \ 2774 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2775 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2776 && (__kmp_affinity_type != affinity_none))) { \ 2777 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2778 } \ 2779 } \ 2780 else { \ 2781 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2782 } \ 2783 } 2784 2785 2786 // 2787 // Re-parse the proclist (for the explicit affinity type), and form the list 2788 // of affinity newMasks indexed by gtid. 2789 // 2790 static void 2791 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2792 unsigned int *out_numMasks, const char *proclist, 2793 kmp_affin_mask_t *osId2Mask, int maxOsId) 2794 { 2795 int i; 2796 const char *scan = proclist; 2797 const char *next = proclist; 2798 2799 // 2800 // We use malloc() for the temporary mask vector, 2801 // so that we can use realloc() to extend it. 2802 // 2803 numNewMasks = 2; 2804 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2805 nextNewMask = 0; 2806 kmp_affin_mask_t *sumMask; 2807 KMP_CPU_ALLOC(sumMask); 2808 int setSize = 0; 2809 2810 for (;;) { 2811 int start, end, stride; 2812 2813 SKIP_WS(scan); 2814 next = scan; 2815 if (*next == '\0') { 2816 break; 2817 } 2818 2819 if (*next == '{') { 2820 int num; 2821 setSize = 0; 2822 next++; // skip '{' 2823 SKIP_WS(next); 2824 scan = next; 2825 2826 // 2827 // Read the first integer in the set. 2828 // 2829 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2830 "bad proclist"); 2831 SKIP_DIGITS(next); 2832 num = __kmp_str_to_int(scan, *next); 2833 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2834 2835 // 2836 // Copy the mask for that osId to the sum (union) mask. 2837 // 2838 if ((num > maxOsId) || 2839 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2840 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2841 && (__kmp_affinity_type != affinity_none))) { 2842 KMP_WARNING(AffIgnoreInvalidProcID, num); 2843 } 2844 KMP_CPU_ZERO(sumMask); 2845 } 2846 else { 2847 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2848 setSize = 1; 2849 } 2850 2851 for (;;) { 2852 // 2853 // Check for end of set. 2854 // 2855 SKIP_WS(next); 2856 if (*next == '}') { 2857 next++; // skip '}' 2858 break; 2859 } 2860 2861 // 2862 // Skip optional comma. 2863 // 2864 if (*next == ',') { 2865 next++; 2866 } 2867 SKIP_WS(next); 2868 2869 // 2870 // Read the next integer in the set. 2871 // 2872 scan = next; 2873 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2874 "bad explicit proc list"); 2875 2876 SKIP_DIGITS(next); 2877 num = __kmp_str_to_int(scan, *next); 2878 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2879 2880 // 2881 // Add the mask for that osId to the sum mask. 2882 // 2883 if ((num > maxOsId) || 2884 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2885 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2886 && (__kmp_affinity_type != affinity_none))) { 2887 KMP_WARNING(AffIgnoreInvalidProcID, num); 2888 } 2889 } 2890 else { 2891 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2892 setSize++; 2893 } 2894 } 2895 if (setSize > 0) { 2896 ADD_MASK(sumMask); 2897 } 2898 2899 SKIP_WS(next); 2900 if (*next == ',') { 2901 next++; 2902 } 2903 scan = next; 2904 continue; 2905 } 2906 2907 // 2908 // Read the first integer. 2909 // 2910 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2911 SKIP_DIGITS(next); 2912 start = __kmp_str_to_int(scan, *next); 2913 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2914 SKIP_WS(next); 2915 2916 // 2917 // If this isn't a range, then add a mask to the list and go on. 2918 // 2919 if (*next != '-') { 2920 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2921 2922 // 2923 // Skip optional comma. 2924 // 2925 if (*next == ',') { 2926 next++; 2927 } 2928 scan = next; 2929 continue; 2930 } 2931 2932 // 2933 // This is a range. Skip over the '-' and read in the 2nd int. 2934 // 2935 next++; // skip '-' 2936 SKIP_WS(next); 2937 scan = next; 2938 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2939 SKIP_DIGITS(next); 2940 end = __kmp_str_to_int(scan, *next); 2941 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2942 2943 // 2944 // Check for a stride parameter 2945 // 2946 stride = 1; 2947 SKIP_WS(next); 2948 if (*next == ':') { 2949 // 2950 // A stride is specified. Skip over the ':" and read the 3rd int. 2951 // 2952 int sign = +1; 2953 next++; // skip ':' 2954 SKIP_WS(next); 2955 scan = next; 2956 if (*next == '-') { 2957 sign = -1; 2958 next++; 2959 SKIP_WS(next); 2960 scan = next; 2961 } 2962 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2963 "bad explicit proc list"); 2964 SKIP_DIGITS(next); 2965 stride = __kmp_str_to_int(scan, *next); 2966 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2967 stride *= sign; 2968 } 2969 2970 // 2971 // Do some range checks. 2972 // 2973 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2974 if (stride > 0) { 2975 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2976 } 2977 else { 2978 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2979 } 2980 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2981 2982 // 2983 // Add the mask for each OS proc # to the list. 2984 // 2985 if (stride > 0) { 2986 do { 2987 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2988 start += stride; 2989 } while (start <= end); 2990 } 2991 else { 2992 do { 2993 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2994 start += stride; 2995 } while (start >= end); 2996 } 2997 2998 // 2999 // Skip optional comma. 
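// (Taken together, a proclist such as "0,2-6:2,{8,9}" yields a mask for
// proc 0, separate masks for procs 2, 4 and 6, and one mask covering procs
// 8 and 9 together, assuming all of those OS procs are present in the
// machine model.)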
3000 // 3001 SKIP_WS(next); 3002 if (*next == ',') { 3003 next++; 3004 } 3005 scan = next; 3006 } 3007 3008 *out_numMasks = nextNewMask; 3009 if (nextNewMask == 0) { 3010 *out_masks = NULL; 3011 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3012 return; 3013 } 3014 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3015 for(i = 0; i < nextNewMask; i++) { 3016 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3017 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3018 KMP_CPU_COPY(dest, src); 3019 } 3020 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3021 KMP_CPU_FREE(sumMask); 3022 } 3023 3024 3025 # if OMP_40_ENABLED 3026 3027 /*----------------------------------------------------------------------------- 3028 3029 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3030 places. Again, Here is the grammar: 3031 3032 place_list := place 3033 place_list := place , place_list 3034 place := num 3035 place := place : num 3036 place := place : num : signed 3037 place := { subplacelist } 3038 place := ! place // (lowest priority) 3039 subplace_list := subplace 3040 subplace_list := subplace , subplace_list 3041 subplace := num 3042 subplace := num : num 3043 subplace := num : num : signed 3044 signed := num 3045 signed := + signed 3046 signed := - signed 3047 3048 -----------------------------------------------------------------------------*/ 3049 3050 static void 3051 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 3052 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3053 { 3054 const char *next; 3055 3056 for (;;) { 3057 int start, count, stride, i; 3058 3059 // 3060 // Read in the starting proc id 3061 // 3062 SKIP_WS(*scan); 3063 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3064 "bad explicit places list"); 3065 next = *scan; 3066 SKIP_DIGITS(next); 3067 start = __kmp_str_to_int(*scan, *next); 3068 KMP_ASSERT(start >= 0); 3069 *scan = next; 3070 3071 // 3072 // valid follow sets are ',' ':' and '}' 3073 // 3074 SKIP_WS(*scan); 3075 if (**scan == '}' || **scan == ',') { 3076 if ((start > maxOsId) || 3077 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3078 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3079 && (__kmp_affinity_type != affinity_none))) { 3080 KMP_WARNING(AffIgnoreInvalidProcID, start); 3081 } 3082 } 3083 else { 3084 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3085 (*setSize)++; 3086 } 3087 if (**scan == '}') { 3088 break; 3089 } 3090 (*scan)++; // skip ',' 3091 continue; 3092 } 3093 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3094 (*scan)++; // skip ':' 3095 3096 // 3097 // Read count parameter 3098 // 3099 SKIP_WS(*scan); 3100 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3101 "bad explicit places list"); 3102 next = *scan; 3103 SKIP_DIGITS(next); 3104 count = __kmp_str_to_int(*scan, *next); 3105 KMP_ASSERT(count >= 0); 3106 *scan = next; 3107 3108 // 3109 // valid follow sets are ',' ':' and '}' 3110 // 3111 SKIP_WS(*scan); 3112 if (**scan == '}' || **scan == ',') { 3113 for (i = 0; i < count; i++) { 3114 if ((start > maxOsId) || 3115 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3116 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3117 && (__kmp_affinity_type != affinity_none))) { 3118 KMP_WARNING(AffIgnoreInvalidProcID, start); 3119 } 3120 break; // don't proliferate warnings for large count 3121 } 3122 else { 3123 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3124 start++; 3125 (*setSize)++; 3126 } 3127 } 3128 if (**scan == '}') { 3129 break; 3130 } 3131 (*scan)++; // skip ',' 3132 continue; 3133 } 3134 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3135 (*scan)++; // skip ':' 3136 3137 // 3138 // Read stride parameter 3139 // 3140 int sign = +1; 3141 for (;;) { 3142 SKIP_WS(*scan); 3143 if (**scan == '+') { 3144 (*scan)++; // skip '+' 3145 continue; 3146 } 3147 if (**scan == '-') { 3148 sign *= -1; 3149 (*scan)++; // skip '-' 3150 continue; 3151 } 3152 break; 3153 } 3154 SKIP_WS(*scan); 3155 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3156 "bad explicit places list"); 3157 next = *scan; 3158 SKIP_DIGITS(next); 3159 stride = __kmp_str_to_int(*scan, *next); 3160 KMP_ASSERT(stride >= 0); 3161 *scan = next; 3162 stride *= sign; 3163 3164 // 3165 // valid follow sets are ',' and '}' 3166 // 3167 SKIP_WS(*scan); 3168 if (**scan == '}' || **scan == ',') { 3169 for (i = 0; i < count; i++) { 3170 if ((start > maxOsId) || 3171 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3172 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3173 && (__kmp_affinity_type != affinity_none))) { 3174 KMP_WARNING(AffIgnoreInvalidProcID, start); 3175 } 3176 break; // don't proliferate warnings for large count 3177 } 3178 else { 3179 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3180 start += stride; 3181 (*setSize)++; 3182 } 3183 } 3184 if (**scan == '}') { 3185 break; 3186 } 3187 (*scan)++; // skip ',' 3188 continue; 3189 } 3190 3191 KMP_ASSERT2(0, "bad explicit places list"); 3192 } 3193 } 3194 3195 3196 static void 3197 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3198 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3199 { 3200 const char *next; 3201 3202 // 3203 // valid follow sets are '{' '!' and num 3204 // 3205 SKIP_WS(*scan); 3206 if (**scan == '{') { 3207 (*scan)++; // skip '{' 3208 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3209 setSize); 3210 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3211 (*scan)++; // skip '}' 3212 } 3213 else if (**scan == '!') { 3214 (*scan)++; // skip '!' 3215 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3216 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3217 } 3218 else if ((**scan >= '0') && (**scan <= '9')) { 3219 next = *scan; 3220 SKIP_DIGITS(next); 3221 int num = __kmp_str_to_int(*scan, *next); 3222 KMP_ASSERT(num >= 0); 3223 if ((num > maxOsId) || 3224 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3225 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3226 && (__kmp_affinity_type != affinity_none))) { 3227 KMP_WARNING(AffIgnoreInvalidProcID, num); 3228 } 3229 } 3230 else { 3231 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3232 (*setSize)++; 3233 } 3234 *scan = next; // skip num 3235 } 3236 else { 3237 KMP_ASSERT2(0, "bad explicit places list"); 3238 } 3239 } 3240 3241 3242 //static void 3243 void 3244 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3245 unsigned int *out_numMasks, const char *placelist, 3246 kmp_affin_mask_t *osId2Mask, int maxOsId) 3247 { 3248 int i,j,count,stride,sign; 3249 const char *scan = placelist; 3250 const char *next = placelist; 3251 3252 numNewMasks = 2; 3253 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3254 nextNewMask = 0; 3255 3256 // tempMask is modified based on the previous or initial 3257 // place to form the current place 3258 // previousMask contains the previous place 3259 kmp_affin_mask_t *tempMask; 3260 kmp_affin_mask_t *previousMask; 3261 KMP_CPU_ALLOC(tempMask); 3262 KMP_CPU_ZERO(tempMask); 3263 KMP_CPU_ALLOC(previousMask); 3264 KMP_CPU_ZERO(previousMask); 3265 int setSize = 0; 3266 3267 for (;;) { 3268 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3269 3270 // 3271 // valid follow sets are ',' ':' and EOL 3272 // 3273 SKIP_WS(scan); 3274 if (*scan == '\0' || *scan == ',') { 3275 if (setSize > 0) { 3276 ADD_MASK(tempMask); 3277 } 3278 KMP_CPU_ZERO(tempMask); 3279 setSize = 0; 3280 if (*scan == '\0') { 3281 break; 3282 } 3283 scan++; // skip ',' 3284 continue; 3285 } 3286 3287 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3288 scan++; // skip ':' 3289 3290 // 3291 // Read count parameter 3292 // 3293 SKIP_WS(scan); 3294 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3295 "bad explicit places list"); 3296 next = scan; 3297 SKIP_DIGITS(next); 3298 count = __kmp_str_to_int(scan, *next); 3299 KMP_ASSERT(count >= 0); 3300 scan = next; 3301 3302 // 3303 // valid follow sets are ',' ':' and EOL 3304 // 3305 SKIP_WS(scan); 3306 if (*scan == '\0' || *scan == ',') { 3307 stride = +1; 3308 } 3309 else { 3310 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3311 scan++; // skip ':' 3312 3313 // 3314 // Read stride parameter 3315 // 3316 sign = +1; 3317 for (;;) { 3318 SKIP_WS(scan); 3319 if (*scan == '+') { 3320 scan++; // skip '+' 3321 continue; 3322 } 3323 if (*scan == '-') { 3324 sign *= -1; 3325 scan++; // skip '-' 3326 continue; 3327 } 3328 break; 3329 } 3330 SKIP_WS(scan); 3331 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3332 "bad explicit places list"); 3333 next = scan; 3334 SKIP_DIGITS(next); 3335 stride = __kmp_str_to_int(scan, *next); 3336 KMP_DEBUG_ASSERT(stride >= 0); 3337 scan = next; 3338 stride *= sign; 3339 } 3340 3341 // Add places determined by initial_place : count : stride 3342 for (i = 0; i < count; i++) { 3343 if (setSize == 0) { 3344 break; 3345 } 3346 // Add the current place, then build the next place (tempMask) from that 3347 KMP_CPU_COPY(previousMask, tempMask); 3348 ADD_MASK(previousMask); 3349 KMP_CPU_ZERO(tempMask); 3350 setSize = 0; 3351 KMP_CPU_SET_ITERATE(j, previousMask) { 3352 if (! KMP_CPU_ISSET(j, previousMask)) { 3353 continue; 3354 } 3355 if ((j+stride > maxOsId) || (j+stride < 0) || 3356 (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3357 (! 
KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) { 3358 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3359 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3360 KMP_WARNING(AffIgnoreInvalidProcID, j+stride); 3361 } 3362 continue; 3363 } 3364 KMP_CPU_SET(j+stride, tempMask); 3365 setSize++; 3366 } 3367 } 3368 KMP_CPU_ZERO(tempMask); 3369 setSize = 0; 3370 3371 // 3372 // valid follow sets are ',' and EOL 3373 // 3374 SKIP_WS(scan); 3375 if (*scan == '\0') { 3376 break; 3377 } 3378 if (*scan == ',') { 3379 scan++; // skip ',' 3380 continue; 3381 } 3382 3383 KMP_ASSERT2(0, "bad explicit places list"); 3384 } 3385 3386 *out_numMasks = nextNewMask; 3387 if (nextNewMask == 0) { 3388 *out_masks = NULL; 3389 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3390 return; 3391 } 3392 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3393 KMP_CPU_FREE(tempMask); 3394 KMP_CPU_FREE(previousMask); 3395 for(i = 0; i < nextNewMask; i++) { 3396 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3397 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3398 KMP_CPU_COPY(dest, src); 3399 } 3400 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3401 } 3402 3403 # endif /* OMP_40_ENABLED */ 3404 3405 #undef ADD_MASK 3406 #undef ADD_MASK_OSID 3407 3408 static void 3409 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3410 { 3411 int i, j, k, n_old = 0, n_new = 0, proc_num = 0; 3412 if (__kmp_place_num_sockets == 0 && 3413 __kmp_place_num_cores == 0 && 3414 __kmp_place_num_threads_per_core == 0 ) 3415 goto _exit; // no topology limiting actions requested, exit 3416 if (__kmp_place_num_sockets == 0) 3417 __kmp_place_num_sockets = nPackages; // use all available sockets 3418 if (__kmp_place_num_cores == 0) 3419 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3420 if (__kmp_place_num_threads_per_core == 0 || 3421 __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore) 3422 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3423 3424 if ( !__kmp_affinity_uniform_topology() ) { 3425 KMP_WARNING( AffHWSubsetNonUniform ); 3426 goto _exit; // don't support non-uniform topology 3427 } 3428 if ( depth > 3 ) { 3429 KMP_WARNING( AffHWSubsetNonThreeLevel ); 3430 goto _exit; // don't support not-3-level topology 3431 } 3432 if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) { 3433 KMP_WARNING(AffHWSubsetManySockets); 3434 goto _exit; 3435 } 3436 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3437 KMP_WARNING( AffHWSubsetManyCores ); 3438 goto _exit; 3439 } 3440 3441 AddrUnsPair *newAddr; 3442 if (pAddr) // pAddr is NULL in case of affinity_none 3443 newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3444 __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3445 3446 for (i = 0; i < nPackages; ++i) { 3447 if (i < __kmp_place_socket_offset || 3448 i >= __kmp_place_socket_offset + __kmp_place_num_sockets) { 3449 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket 3450 if (__kmp_pu_os_idx != NULL) { 3451 for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket 3452 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { 3453 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3454 ++proc_num; 3455 } 3456 } 3457 } 3458 } else { 3459 for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket 3460 if (j < __kmp_place_core_offset || 3461 j >= __kmp_place_core_offset + __kmp_place_num_cores) { 3462 
n_old += __kmp_nThreadsPerCore; // skip not-requested core 3463 if (__kmp_pu_os_idx != NULL) { 3464 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core 3465 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3466 ++proc_num; 3467 } 3468 } 3469 } else { 3470 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core 3471 if (k < __kmp_place_num_threads_per_core) { 3472 if (pAddr) 3473 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data 3474 n_new++; 3475 } else { 3476 if (__kmp_pu_os_idx != NULL) 3477 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3478 } 3479 n_old++; 3480 ++proc_num; 3481 } 3482 } 3483 } 3484 } 3485 } 3486 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3487 KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores * 3488 __kmp_place_num_threads_per_core); 3489 3490 nPackages = __kmp_place_num_sockets; // correct nPackages 3491 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3492 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3493 __kmp_avail_proc = n_new; // correct avail_proc 3494 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3495 3496 if (pAddr) { 3497 __kmp_free( *pAddr ); 3498 *pAddr = newAddr; // replace old topology with new one 3499 } 3500 _exit: 3501 if (__kmp_pu_os_idx != NULL) { 3502 __kmp_free(__kmp_pu_os_idx); 3503 __kmp_pu_os_idx = NULL; 3504 } 3505 } 3506 3507 // 3508 // This function figures out the deepest level at which there is at least one cluster/core 3509 // with more than one processing unit bound to it. 3510 // 3511 static int 3512 __kmp_affinity_find_core_level(const AddrUnsPair *address2os, int nprocs, int bottom_level) 3513 { 3514 int core_level = 0; 3515 3516 for( int i = 0; i < nprocs; i++ ) { 3517 for( int j = bottom_level; j > 0; j-- ) { 3518 if( address2os[i].first.labels[j] > 0 ) { 3519 if( core_level < ( j - 1 ) ) { 3520 core_level = j - 1; 3521 } 3522 } 3523 } 3524 } 3525 return core_level; 3526 } 3527 3528 // 3529 // This function counts the number of clusters/cores at the given level. 3530 // 3531 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level) 3532 { 3533 int ncores = 0; 3534 int i, j; 3535 3536 j = bottom_level; 3537 for( i = 0; i < nprocs; i++ ) { 3538 for ( j = bottom_level; j > core_level; j-- ) { 3539 if( ( i + 1 ) < nprocs ) { 3540 if( address2os[i + 1].first.labels[j] > 0 ) { 3541 break; 3542 } 3543 } 3544 } 3545 if( j == core_level ) { 3546 ncores++; 3547 } 3548 } 3549 if( j > core_level ) { 3550 // 3551 // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss one core. 3552 // May occur when called from __kmp_affinity_find_core(). 3553 // 3554 ncores++; 3555 } 3556 return ncores; 3557 } 3558 3559 // 3560 // This function finds the cluster/core to which the given processing unit is bound. 3561 // 3562 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, int bottom_level, int core_level) 3563 { 3564 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, core_level) - 1; 3565 } 3566 3567 // 3568 // This function finds the maximal number of processing units bound to a cluster/core at the given level.
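// For example, with a three-level map (package, core, thread), bottom_level
// is 2; if core_level is 1, labels[core_level + 1] is the thread number
// within its core, so maxprocpercore below ends up as the largest such
// number plus 1.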
3569 // 3570 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level) 3571 { 3572 int maxprocpercore = 0; 3573 3574 if( core_level < bottom_level ) { 3575 for( int i = 0; i < nprocs; i++ ) { 3576 int percore = address2os[i].first.labels[core_level + 1] + 1; 3577 3578 if( percore > maxprocpercore ) { 3579 maxprocpercore = percore; 3580 } 3581 } 3582 } else { 3583 maxprocpercore = 1; 3584 } 3585 return maxprocpercore; 3586 } 3587 3588 static AddrUnsPair *address2os = NULL; 3589 static int * procarr = NULL; 3590 static int __kmp_aff_depth = 0; 3591 3592 #define KMP_EXIT_AFF_NONE \ 3593 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 3594 KMP_ASSERT(address2os == NULL); \ 3595 __kmp_apply_thread_places(NULL, 0); \ 3596 return; 3597 3598 static int 3599 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) 3600 { 3601 const Address *aa = (const Address *)&(((AddrUnsPair *)a) 3602 ->first); 3603 const Address *bb = (const Address *)&(((AddrUnsPair *)b) 3604 ->first); 3605 unsigned depth = aa->depth; 3606 unsigned i; 3607 KMP_DEBUG_ASSERT(depth == bb->depth); 3608 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 3609 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 3610 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 3611 int j = depth - i - 1; 3612 if (aa->childNums[j] < bb->childNums[j]) return -1; 3613 if (aa->childNums[j] > bb->childNums[j]) return 1; 3614 } 3615 for (; i < depth; i++) { 3616 int j = i - __kmp_affinity_compact; 3617 if (aa->childNums[j] < bb->childNums[j]) return -1; 3618 if (aa->childNums[j] > bb->childNums[j]) return 1; 3619 } 3620 return 0; 3621 } 3622 3623 static void 3624 __kmp_aux_affinity_initialize(void) 3625 { 3626 if (__kmp_affinity_masks != NULL) { 3627 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3628 return; 3629 } 3630 3631 // 3632 // Create the "full" mask - this defines all of the processors that we 3633 // consider to be in the machine model. If respect is set, then it is 3634 // the initialization thread's affinity mask. Otherwise, it is all 3635 // processors that we know about on the machine. 3636 // 3637 if (__kmp_affin_fullMask == NULL) { 3638 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3639 } 3640 if (KMP_AFFINITY_CAPABLE()) { 3641 if (__kmp_affinity_respect_mask) { 3642 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3643 3644 // 3645 // Count the number of available processors. 3646 // 3647 unsigned i; 3648 __kmp_avail_proc = 0; 3649 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3650 if (! 
KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3651 continue; 3652 } 3653 __kmp_avail_proc++; 3654 } 3655 if (__kmp_avail_proc > __kmp_xproc) { 3656 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3657 && (__kmp_affinity_type != affinity_none))) { 3658 KMP_WARNING(ErrorInitializeAffinity); 3659 } 3660 __kmp_affinity_type = affinity_none; 3661 KMP_AFFINITY_DISABLE(); 3662 return; 3663 } 3664 } 3665 else { 3666 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3667 __kmp_avail_proc = __kmp_xproc; 3668 } 3669 } 3670 3671 int depth = -1; 3672 kmp_i18n_id_t msg_id = kmp_i18n_null; 3673 3674 // 3675 // For backward compatibility, setting KMP_CPUINFO_FILE => 3676 // KMP_TOPOLOGY_METHOD=cpuinfo 3677 // 3678 if ((__kmp_cpuinfo_file != NULL) && 3679 (__kmp_affinity_top_method == affinity_top_method_all)) { 3680 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3681 } 3682 3683 if (__kmp_affinity_top_method == affinity_top_method_all) { 3684 // 3685 // In the default code path, errors are not fatal - we just try using 3686 // another method. We only emit a warning message if affinity is on, 3687 // or the verbose flag is set, an the nowarnings flag was not set. 3688 // 3689 const char *file_name = NULL; 3690 int line = 0; 3691 # if KMP_USE_HWLOC 3692 if (depth < 0 && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3693 if (__kmp_affinity_verbose) { 3694 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 3695 } 3696 if(!__kmp_hwloc_error) { 3697 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 3698 if (depth == 0) { 3699 KMP_EXIT_AFF_NONE; 3700 } else if(depth < 0 && __kmp_affinity_verbose) { 3701 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3702 } 3703 } else if(__kmp_affinity_verbose) { 3704 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3705 } 3706 } 3707 # endif 3708 3709 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3710 3711 if (depth < 0) { 3712 if (__kmp_affinity_verbose) { 3713 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3714 } 3715 3716 file_name = NULL; 3717 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3718 if (depth == 0) { 3719 KMP_EXIT_AFF_NONE; 3720 } 3721 3722 if (depth < 0) { 3723 if (__kmp_affinity_verbose) { 3724 if (msg_id != kmp_i18n_null) { 3725 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3726 KMP_I18N_STR(DecodingLegacyAPIC)); 3727 } 3728 else { 3729 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3730 } 3731 } 3732 3733 file_name = NULL; 3734 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3735 if (depth == 0) { 3736 KMP_EXIT_AFF_NONE; 3737 } 3738 } 3739 } 3740 3741 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3742 3743 # if KMP_OS_LINUX 3744 3745 if (depth < 0) { 3746 if (__kmp_affinity_verbose) { 3747 if (msg_id != kmp_i18n_null) { 3748 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3749 } 3750 else { 3751 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3752 } 3753 } 3754 3755 FILE *f = fopen("/proc/cpuinfo", "r"); 3756 if (f == NULL) { 3757 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3758 } 3759 else { 3760 file_name = "/proc/cpuinfo"; 3761 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3762 fclose(f); 3763 if (depth == 0) { 3764 KMP_EXIT_AFF_NONE; 3765 } 3766 } 3767 } 3768 3769 # endif /* KMP_OS_LINUX */ 3770 3771 # if KMP_GROUP_AFFINITY 3772 3773 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3774 if (__kmp_affinity_verbose) { 3775 
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3776 } 3777 3778 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3779 KMP_ASSERT(depth != 0); 3780 } 3781 3782 # endif /* KMP_GROUP_AFFINITY */ 3783 3784 if (depth < 0) { 3785 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3786 if (file_name == NULL) { 3787 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3788 } 3789 else if (line == 0) { 3790 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3791 } 3792 else { 3793 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3794 } 3795 } 3796 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3797 3798 file_name = ""; 3799 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3800 if (depth == 0) { 3801 KMP_EXIT_AFF_NONE; 3802 } 3803 KMP_ASSERT(depth > 0); 3804 KMP_ASSERT(address2os != NULL); 3805 } 3806 } 3807 3808 // 3809 // If the user has specified that a paricular topology discovery method 3810 // is to be used, then we abort if that method fails. The exception is 3811 // group affinity, which might have been implicitly set. 3812 // 3813 3814 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3815 3816 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3817 if (__kmp_affinity_verbose) { 3818 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3819 KMP_I18N_STR(Decodingx2APIC)); 3820 } 3821 3822 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3823 if (depth == 0) { 3824 KMP_EXIT_AFF_NONE; 3825 } 3826 if (depth < 0) { 3827 KMP_ASSERT(msg_id != kmp_i18n_null); 3828 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3829 } 3830 } 3831 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3832 if (__kmp_affinity_verbose) { 3833 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3834 KMP_I18N_STR(DecodingLegacyAPIC)); 3835 } 3836 3837 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3838 if (depth == 0) { 3839 KMP_EXIT_AFF_NONE; 3840 } 3841 if (depth < 0) { 3842 KMP_ASSERT(msg_id != kmp_i18n_null); 3843 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3844 } 3845 } 3846 3847 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3848 3849 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3850 const char *filename; 3851 if (__kmp_cpuinfo_file != NULL) { 3852 filename = __kmp_cpuinfo_file; 3853 } 3854 else { 3855 filename = "/proc/cpuinfo"; 3856 } 3857 3858 if (__kmp_affinity_verbose) { 3859 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3860 } 3861 3862 FILE *f = fopen(filename, "r"); 3863 if (f == NULL) { 3864 int code = errno; 3865 if (__kmp_cpuinfo_file != NULL) { 3866 __kmp_msg( 3867 kmp_ms_fatal, 3868 KMP_MSG(CantOpenFileForReading, filename), 3869 KMP_ERR(code), 3870 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3871 __kmp_msg_null 3872 ); 3873 } 3874 else { 3875 __kmp_msg( 3876 kmp_ms_fatal, 3877 KMP_MSG(CantOpenFileForReading, filename), 3878 KMP_ERR(code), 3879 __kmp_msg_null 3880 ); 3881 } 3882 } 3883 int line = 0; 3884 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3885 fclose(f); 3886 if (depth < 0) { 3887 KMP_ASSERT(msg_id != kmp_i18n_null); 3888 if (line > 0) { 3889 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3890 } 3891 else { 3892 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3893 } 3894 } 3895 if (__kmp_affinity_type == affinity_none) { 3896 KMP_ASSERT(depth == 0); 3897 KMP_EXIT_AFF_NONE; 3898 } 3899 } 3900 3901 # if KMP_GROUP_AFFINITY 3902 3903 else if (__kmp_affinity_top_method 
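// Unlike the default path, an explicitly requested method is fatal on
// failure. For illustration (hypothetical setting): KMP_TOPOLOGY_METHOD=cpuinfo
// with KMP_CPUINFO_FILE=/tmp/cpuinfo.copy parses the saved file instead of
// the live /proc/cpuinfo, and a parse error aborts via FileMsgExiting /
// FileLineMsgExiting rather than falling back to the flat map.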
== affinity_top_method_group) { 3904 if (__kmp_affinity_verbose) { 3905 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3906 } 3907 3908 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3909 KMP_ASSERT(depth != 0); 3910 if (depth < 0) { 3911 KMP_ASSERT(msg_id != kmp_i18n_null); 3912 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3913 } 3914 } 3915 3916 # endif /* KMP_GROUP_AFFINITY */ 3917 3918 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3919 if (__kmp_affinity_verbose) { 3920 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3921 } 3922 3923 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3924 if (depth == 0) { 3925 KMP_EXIT_AFF_NONE; 3926 } 3927 // should not fail 3928 KMP_ASSERT(depth > 0); 3929 KMP_ASSERT(address2os != NULL); 3930 } 3931 3932 # if KMP_USE_HWLOC 3933 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3934 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 3935 if (__kmp_affinity_verbose) { 3936 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 3937 } 3938 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 3939 if (depth == 0) { 3940 KMP_EXIT_AFF_NONE; 3941 } 3942 } 3943 # endif // KMP_USE_HWLOC 3944 3945 if (address2os == NULL) { 3946 if (KMP_AFFINITY_CAPABLE() 3947 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3948 && (__kmp_affinity_type != affinity_none)))) { 3949 KMP_WARNING(ErrorInitializeAffinity); 3950 } 3951 __kmp_affinity_type = affinity_none; 3952 KMP_AFFINITY_DISABLE(); 3953 return; 3954 } 3955 3956 __kmp_apply_thread_places(&address2os, depth); 3957 3958 // 3959 // Create the table of masks, indexed by thread Id. 3960 // 3961 unsigned maxIndex; 3962 unsigned numUnique; 3963 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3964 address2os, __kmp_avail_proc); 3965 if (__kmp_affinity_gran_levels == 0) { 3966 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3967 } 3968 3969 // 3970 // Set the childNums vector in all Address objects. This must be done 3971 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3972 // which takes into account the setting of __kmp_affinity_compact. 3973 // 3974 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3975 3976 switch (__kmp_affinity_type) { 3977 3978 case affinity_explicit: 3979 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3980 # if OMP_40_ENABLED 3981 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3982 # endif 3983 { 3984 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3985 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3986 maxIndex); 3987 } 3988 # if OMP_40_ENABLED 3989 else { 3990 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3991 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3992 maxIndex); 3993 } 3994 # endif 3995 if (__kmp_affinity_num_masks == 0) { 3996 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3997 && (__kmp_affinity_type != affinity_none))) { 3998 KMP_WARNING(AffNoValidProcID); 3999 } 4000 __kmp_affinity_type = affinity_none; 4001 return; 4002 } 4003 break; 4004 4005 // 4006 // The other affinity types rely on sorting the Addresses according 4007 // to some permutation of the machine topology tree. Set 4008 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 4009 // then jump to a common code fragment to do the sort and create 4010 // the array of affinity masks. 
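// For illustration (hypothetical 2 packages x 2 cores x 2 threads, so
// depth == 3, default permute of 0): "compact" leaves __kmp_affinity_compact
// at 0 and the sort orders by package, then core, then thread, so
// consecutive masks share a core; "scatter" flips it to depth - 1 == 2,
// ordering by thread, then core, then package, so consecutive masks land
// on different packages.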
4011 // 4012 4013 case affinity_logical: 4014 __kmp_affinity_compact = 0; 4015 if (__kmp_affinity_offset) { 4016 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 4017 % __kmp_avail_proc; 4018 } 4019 goto sortAddresses; 4020 4021 case affinity_physical: 4022 if (__kmp_nThreadsPerCore > 1) { 4023 __kmp_affinity_compact = 1; 4024 if (__kmp_affinity_compact >= depth) { 4025 __kmp_affinity_compact = 0; 4026 } 4027 } else { 4028 __kmp_affinity_compact = 0; 4029 } 4030 if (__kmp_affinity_offset) { 4031 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 4032 % __kmp_avail_proc; 4033 } 4034 goto sortAddresses; 4035 4036 case affinity_scatter: 4037 if (__kmp_affinity_compact >= depth) { 4038 __kmp_affinity_compact = 0; 4039 } 4040 else { 4041 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4042 } 4043 goto sortAddresses; 4044 4045 case affinity_compact: 4046 if (__kmp_affinity_compact >= depth) { 4047 __kmp_affinity_compact = depth - 1; 4048 } 4049 goto sortAddresses; 4050 4051 case affinity_balanced: 4052 if( depth <= 1 ) { 4053 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 4054 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 4055 } 4056 __kmp_affinity_type = affinity_none; 4057 return; 4058 } else if( __kmp_affinity_uniform_topology() ) { 4059 break; 4060 } else { // Non-uniform topology 4061 4062 // Save the depth for further usage 4063 __kmp_aff_depth = depth; 4064 4065 int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, depth - 1); 4066 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level); 4067 int maxprocpercore = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, depth - 1, core_level); 4068 4069 int nproc = ncores * maxprocpercore; 4070 if( ( nproc < 2 ) || ( nproc < __kmp_avail_proc ) ) { 4071 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 4072 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 4073 } 4074 __kmp_affinity_type = affinity_none; 4075 return; 4076 } 4077 4078 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 4079 for( int i = 0; i < nproc; i++ ) { 4080 procarr[ i ] = -1; 4081 } 4082 4083 int lastcore = -1; 4084 int inlastcore = 0; 4085 for( int i = 0; i < __kmp_avail_proc; i++ ) { 4086 int proc = address2os[ i ].second; 4087 int core = __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4088 4089 if ( core == lastcore ) { 4090 inlastcore++; 4091 } else { 4092 inlastcore = 0; 4093 } 4094 lastcore = core; 4095 4096 procarr[ core * maxprocpercore + inlastcore ] = proc; 4097 } 4098 4099 break; 4100 } 4101 4102 sortAddresses: 4103 // 4104 // Allocate the gtid->affinity mask table. 4105 // 4106 if (__kmp_affinity_dups) { 4107 __kmp_affinity_num_masks = __kmp_avail_proc; 4108 } 4109 else { 4110 __kmp_affinity_num_masks = numUnique; 4111 } 4112 4113 # if OMP_40_ENABLED 4114 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 4115 && ( __kmp_affinity_num_places > 0 ) 4116 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 4117 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4118 } 4119 # endif 4120 4121 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4122 4123 // 4124 // Sort the address2os table according to the current setting of 4125 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
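// For the logical/physical types the requested offset is scaled by
// __kmp_nThreadsPerCore above; e.g. (hypothetical) an offset of 1 on a
// 2-way SMT machine becomes 2 OS procs, i.e. one full core. Roughly, with
// KMP_AFFINITY granularity=core and dups disabled, __kmp_affinity_num_masks
// is numUnique, i.e. one mask per core rather than one per hardware thread.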
4126 // 4127 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4128 __kmp_affinity_cmp_Address_child_num); 4129 { 4130 int i; 4131 unsigned j; 4132 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4133 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 4134 continue; 4135 } 4136 unsigned osId = address2os[i].second; 4137 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4138 kmp_affin_mask_t *dest 4139 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4140 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4141 KMP_CPU_COPY(dest, src); 4142 if (++j >= __kmp_affinity_num_masks) { 4143 break; 4144 } 4145 } 4146 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4147 } 4148 break; 4149 4150 default: 4151 KMP_ASSERT2(0, "Unexpected affinity setting"); 4152 } 4153 4154 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex+1); 4155 machine_hierarchy.init(address2os, __kmp_avail_proc); 4156 } 4157 #undef KMP_EXIT_AFF_NONE 4158 4159 4160 void 4161 __kmp_affinity_initialize(void) 4162 { 4163 // 4164 // Much of the code above was written assumming that if a machine was not 4165 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4166 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4167 // 4168 // There are too many checks for __kmp_affinity_type == affinity_none 4169 // in this code. Instead of trying to change them all, check if 4170 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4171 // affinity_none, call the real initialization routine, then restore 4172 // __kmp_affinity_type to affinity_disabled. 4173 // 4174 int disabled = (__kmp_affinity_type == affinity_disabled); 4175 if (! KMP_AFFINITY_CAPABLE()) { 4176 KMP_ASSERT(disabled); 4177 } 4178 if (disabled) { 4179 __kmp_affinity_type = affinity_none; 4180 } 4181 __kmp_aux_affinity_initialize(); 4182 if (disabled) { 4183 __kmp_affinity_type = affinity_disabled; 4184 } 4185 } 4186 4187 4188 void 4189 __kmp_affinity_uninitialize(void) 4190 { 4191 if (__kmp_affinity_masks != NULL) { 4192 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4193 __kmp_affinity_masks = NULL; 4194 } 4195 if (__kmp_affin_fullMask != NULL) { 4196 KMP_CPU_FREE(__kmp_affin_fullMask); 4197 __kmp_affin_fullMask = NULL; 4198 } 4199 __kmp_affinity_num_masks = 0; 4200 # if OMP_40_ENABLED 4201 __kmp_affinity_num_places = 0; 4202 # endif 4203 if (__kmp_affinity_proclist != NULL) { 4204 __kmp_free(__kmp_affinity_proclist); 4205 __kmp_affinity_proclist = NULL; 4206 } 4207 if( address2os != NULL ) { 4208 __kmp_free( address2os ); 4209 address2os = NULL; 4210 } 4211 if( procarr != NULL ) { 4212 __kmp_free( procarr ); 4213 procarr = NULL; 4214 } 4215 # if KMP_USE_HWLOC 4216 if (__kmp_hwloc_topology != NULL) { 4217 hwloc_topology_destroy(__kmp_hwloc_topology); 4218 __kmp_hwloc_topology = NULL; 4219 } 4220 # endif 4221 KMPAffinity::destroy_api(); 4222 } 4223 4224 4225 void 4226 __kmp_affinity_set_init_mask(int gtid, int isa_root) 4227 { 4228 if (! KMP_AFFINITY_CAPABLE()) { 4229 return; 4230 } 4231 4232 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4233 if (th->th.th_affin_mask == NULL) { 4234 KMP_CPU_ALLOC(th->th.th_affin_mask); 4235 } 4236 else { 4237 KMP_CPU_ZERO(th->th.th_affin_mask); 4238 } 4239 4240 // 4241 // Copy the thread mask to the kmp_info_t strucuture. 4242 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. 
one 4243 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask 4244 // is set, then the full mask is the same as the mask of the initialization 4245 // thread. 4246 // 4247 kmp_affin_mask_t *mask; 4248 int i; 4249 4250 # if OMP_40_ENABLED 4251 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4252 # endif 4253 { 4254 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced) 4255 ) { 4256 # if KMP_GROUP_AFFINITY 4257 if (__kmp_num_proc_groups > 1) { 4258 return; 4259 } 4260 # endif 4261 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4262 i = KMP_PLACE_ALL; 4263 mask = __kmp_affin_fullMask; 4264 } 4265 else { 4266 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4267 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4268 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4269 } 4270 } 4271 # if OMP_40_ENABLED 4272 else { 4273 if ((! isa_root) 4274 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4275 # if KMP_GROUP_AFFINITY 4276 if (__kmp_num_proc_groups > 1) { 4277 return; 4278 } 4279 # endif 4280 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4281 i = KMP_PLACE_ALL; 4282 mask = __kmp_affin_fullMask; 4283 } 4284 else { 4285 // 4286 // int i = some hash function or just a counter that doesn't 4287 // always start at 0. Use gtid for now. 4288 // 4289 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4290 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4291 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4292 } 4293 } 4294 # endif 4295 4296 # if OMP_40_ENABLED 4297 th->th.th_current_place = i; 4298 if (isa_root) { 4299 th->th.th_new_place = i; 4300 th->th.th_first_place = 0; 4301 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4302 } 4303 4304 if (i == KMP_PLACE_ALL) { 4305 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4306 gtid)); 4307 } 4308 else { 4309 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4310 gtid, i)); 4311 } 4312 # else 4313 if (i == -1) { 4314 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4315 gtid)); 4316 } 4317 else { 4318 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4319 gtid, i)); 4320 } 4321 # endif /* OMP_40_ENABLED */ 4322 4323 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4324 4325 if (__kmp_affinity_verbose) { 4326 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4327 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4328 th->th.th_affin_mask); 4329 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid, 4330 buf); 4331 } 4332 4333 # if KMP_OS_WINDOWS 4334 // 4335 // On Windows* OS, the process affinity mask might have changed. 4336 // If the user didn't request affinity and this call fails, 4337 // just continue silently. See CQ171393. 4338 // 4339 if ( __kmp_affinity_type == affinity_none ) { 4340 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4341 } 4342 else 4343 # endif 4344 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4345 } 4346 4347 4348 # if OMP_40_ENABLED 4349 4350 void 4351 __kmp_affinity_set_place(int gtid) 4352 { 4353 int retval; 4354 4355 if (! KMP_AFFINITY_CAPABLE()) { 4356 return; 4357 } 4358 4359 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4360 4361 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n", 4362 gtid, th->th.th_new_place, th->th.th_current_place)); 4363 4364 // 4365 // Check that the new place is within this thread's partition. 
4366 // 4367 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4368 KMP_ASSERT(th->th.th_new_place >= 0); 4369 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4370 if (th->th.th_first_place <= th->th.th_last_place) { 4371 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) 4372 && (th->th.th_new_place <= th->th.th_last_place)); 4373 } 4374 else { 4375 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) 4376 || (th->th.th_new_place >= th->th.th_last_place)); 4377 } 4378 4379 // 4380 // Copy the thread mask to the kmp_info_t strucuture, 4381 // and set this thread's affinity. 4382 // 4383 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, 4384 th->th.th_new_place); 4385 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4386 th->th.th_current_place = th->th.th_new_place; 4387 4388 if (__kmp_affinity_verbose) { 4389 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4390 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4391 th->th.th_affin_mask); 4392 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4393 gtid, buf); 4394 } 4395 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4396 } 4397 4398 # endif /* OMP_40_ENABLED */ 4399 4400 4401 int 4402 __kmp_aux_set_affinity(void **mask) 4403 { 4404 int gtid; 4405 kmp_info_t *th; 4406 int retval; 4407 4408 if (! KMP_AFFINITY_CAPABLE()) { 4409 return -1; 4410 } 4411 4412 gtid = __kmp_entry_gtid(); 4413 KA_TRACE(1000, ;{ 4414 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4415 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4416 (kmp_affin_mask_t *)(*mask)); 4417 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4418 gtid, buf); 4419 }); 4420 4421 if (__kmp_env_consistency_check) { 4422 if ((mask == NULL) || (*mask == NULL)) { 4423 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4424 } 4425 else { 4426 unsigned proc; 4427 int num_procs = 0; 4428 4429 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) { 4430 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4431 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4432 } 4433 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4434 continue; 4435 } 4436 num_procs++; 4437 } 4438 if (num_procs == 0) { 4439 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4440 } 4441 4442 # if KMP_GROUP_AFFINITY 4443 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 4444 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4445 } 4446 # endif /* KMP_GROUP_AFFINITY */ 4447 4448 } 4449 } 4450 4451 th = __kmp_threads[gtid]; 4452 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4453 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4454 if (retval == 0) { 4455 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 4456 } 4457 4458 # if OMP_40_ENABLED 4459 th->th.th_current_place = KMP_PLACE_UNDEFINED; 4460 th->th.th_new_place = KMP_PLACE_UNDEFINED; 4461 th->th.th_first_place = 0; 4462 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4463 4464 // 4465 // Turn off 4.0 affinity for the current tread at this parallel level. 4466 // 4467 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4468 # endif 4469 4470 return retval; 4471 } 4472 4473 4474 int 4475 __kmp_aux_get_affinity(void **mask) 4476 { 4477 int gtid; 4478 int retval; 4479 kmp_info_t *th; 4480 4481 if (! 
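// Note on the validation above: when consistency checking is enabled, a
// user-supplied mask is rejected with a fatal error if it is empty, names a
// proc outside __kmp_affin_fullMask, or (with Windows processor groups)
// cannot be placed in a single group; otherwise the mask is handed to the
// OS and, on success, cached in th_affin_mask.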
KMP_AFFINITY_CAPABLE()) { 4482 return -1; 4483 } 4484 4485 gtid = __kmp_entry_gtid(); 4486 th = __kmp_threads[gtid]; 4487 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4488 4489 KA_TRACE(1000, ;{ 4490 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4491 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4492 th->th.th_affin_mask); 4493 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf); 4494 }); 4495 4496 if (__kmp_env_consistency_check) { 4497 if ((mask == NULL) || (*mask == NULL)) { 4498 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4499 } 4500 } 4501 4502 # if !KMP_OS_WINDOWS 4503 4504 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4505 KA_TRACE(1000, ;{ 4506 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4507 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4508 (kmp_affin_mask_t *)(*mask)); 4509 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf); 4510 }); 4511 return retval; 4512 4513 # else 4514 4515 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4516 return 0; 4517 4518 # endif /* KMP_OS_WINDOWS */ 4519 4520 } 4521 4522 int 4523 __kmp_aux_get_affinity_max_proc() { 4524 if (! KMP_AFFINITY_CAPABLE()) { 4525 return 0; 4526 } 4527 #if KMP_GROUP_AFFINITY 4528 if ( __kmp_num_proc_groups > 1 ) { 4529 return (int)(__kmp_num_proc_groups*sizeof(DWORD_PTR)*CHAR_BIT); 4530 } 4531 #endif 4532 return __kmp_xproc; 4533 } 4534 4535 int 4536 __kmp_aux_set_affinity_mask_proc(int proc, void **mask) 4537 { 4538 int retval; 4539 4540 if (! KMP_AFFINITY_CAPABLE()) { 4541 return -1; 4542 } 4543 4544 KA_TRACE(1000, ;{ 4545 int gtid = __kmp_entry_gtid(); 4546 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4547 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4548 (kmp_affin_mask_t *)(*mask)); 4549 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4550 proc, gtid, buf); 4551 }); 4552 4553 if (__kmp_env_consistency_check) { 4554 if ((mask == NULL) || (*mask == NULL)) { 4555 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4556 } 4557 } 4558 4559 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4560 return -1; 4561 } 4562 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4563 return -2; 4564 } 4565 4566 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4567 return 0; 4568 } 4569 4570 4571 int 4572 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4573 { 4574 int retval; 4575 4576 if (! KMP_AFFINITY_CAPABLE()) { 4577 return -1; 4578 } 4579 4580 KA_TRACE(1000, ;{ 4581 int gtid = __kmp_entry_gtid(); 4582 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4583 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4584 (kmp_affin_mask_t *)(*mask)); 4585 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4586 proc, gtid, buf); 4587 }); 4588 4589 if (__kmp_env_consistency_check) { 4590 if ((mask == NULL) || (*mask == NULL)) { 4591 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4592 } 4593 } 4594 4595 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4596 return -1; 4597 } 4598 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4599 return -2; 4600 } 4601 4602 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4603 return 0; 4604 } 4605 4606 4607 int 4608 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4609 { 4610 int retval; 4611 4612 if (! 
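// These helpers back the kmp_*_affinity* user API (the helpers above return
// 0 on success, -1 for an out-of-range proc, -2 for a proc outside the full
// mask). A minimal usage sketch in user code (hypothetical, error handling
// omitted):
//
//   kmp_affinity_mask_t m;
//   kmp_create_affinity_mask(&m);
//   if (kmp_set_affinity_mask_proc(3, &m) == 0)
//       kmp_set_affinity(&m);
//   kmp_destroy_affinity_mask(&m);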
KMP_AFFINITY_CAPABLE()) { 4613 return -1; 4614 } 4615 4616 KA_TRACE(1000, ;{ 4617 int gtid = __kmp_entry_gtid(); 4618 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4619 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4620 (kmp_affin_mask_t *)(*mask)); 4621 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n", 4622 proc, gtid, buf); 4623 }); 4624 4625 if (__kmp_env_consistency_check) { 4626 if ((mask == NULL) || (*mask == NULL)) { 4627 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4628 } 4629 } 4630 4631 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4632 return -1; 4633 } 4634 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4635 return 0; 4636 } 4637 4638 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4639 } 4640 4641 4642 // Dynamic affinity settings - Affinity balanced 4643 void __kmp_balanced_affinity( int tid, int nthreads ) 4644 { 4645 bool fine_gran = true; 4646 4647 switch (__kmp_affinity_gran) { 4648 case affinity_gran_fine: 4649 case affinity_gran_thread: 4650 break; 4651 case affinity_gran_core: 4652 if( __kmp_nThreadsPerCore > 1) { 4653 fine_gran = false; 4654 } 4655 break; 4656 case affinity_gran_package: 4657 if( nCoresPerPkg > 1) { 4658 fine_gran = false; 4659 } 4660 break; 4661 default: 4662 fine_gran = false; 4663 } 4664 4665 if( __kmp_affinity_uniform_topology() ) { 4666 int coreID; 4667 int threadID; 4668 // Number of hyper threads per core in HT machine 4669 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4670 // Number of cores 4671 int ncores = __kmp_ncores; 4672 if( ( nPackages > 1 ) && ( __kmp_nth_per_core <= 1 ) ) { 4673 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4674 ncores = nPackages; 4675 } 4676 // How many threads will be bound to each core 4677 int chunk = nthreads / ncores; 4678 // How many cores will have an additional thread bound to it - "big cores" 4679 int big_cores = nthreads % ncores; 4680 // Number of threads on the big cores 4681 int big_nth = ( chunk + 1 ) * big_cores; 4682 if( tid < big_nth ) { 4683 coreID = tid / (chunk + 1 ); 4684 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ; 4685 } else { //tid >= big_nth 4686 coreID = ( tid - big_cores ) / chunk; 4687 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ; 4688 } 4689 4690 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4691 "Illegal set affinity operation when not capable"); 4692 4693 kmp_affin_mask_t *mask; 4694 KMP_CPU_ALLOC_ON_STACK(mask); 4695 KMP_CPU_ZERO(mask); 4696 4697 if( fine_gran ) { 4698 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; 4699 KMP_CPU_SET( osID, mask); 4700 } else { 4701 for( int i = 0; i < __kmp_nth_per_core; i++ ) { 4702 int osID; 4703 osID = address2os[ coreID * __kmp_nth_per_core + i ].second; 4704 KMP_CPU_SET( osID, mask); 4705 } 4706 } 4707 if (__kmp_affinity_verbose) { 4708 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4709 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4710 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4711 tid, buf); 4712 } 4713 __kmp_set_system_affinity( mask, TRUE ); 4714 KMP_CPU_FREE_FROM_STACK(mask); 4715 } else { // Non-uniform topology 4716 4717 kmp_affin_mask_t *mask; 4718 KMP_CPU_ALLOC_ON_STACK(mask); 4719 KMP_CPU_ZERO(mask); 4720 4721 int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 4722 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4723 int 
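// A worked example for the uniform branch above (hypothetical 4 cores x
// 2 SMT, nthreads == 6): chunk == 1, big_cores == 2, big_nth == 4, so tids
// 0-3 pack two per core onto cores 0 and 1 while tids 4-5 get cores 2 and 3
// to themselves; every core receives either chunk or chunk + 1 threads.
//
// In the non-uniform branch below, procarr (filled in
// __kmp_aux_affinity_initialize) is a dense ncores x nth_per_core grid with
// -1 marking missing contexts; e.g. (hypothetical) core 0 with 4 contexts
// and core 1 with 2 gives { p0, p1, p2, p3, p4, p5, -1, -1 }, indexed as
// procarr[core * nth_per_core + context].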
nth_per_core = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4724 4725 // For performance gain consider the special case nthreads == __kmp_avail_proc 4726 if( nthreads == __kmp_avail_proc ) { 4727 if( fine_gran ) { 4728 int osID = address2os[ tid ].second; 4729 KMP_CPU_SET( osID, mask); 4730 } else { 4731 int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level); 4732 for( int i = 0; i < __kmp_avail_proc; i++ ) { 4733 int osID = address2os[ i ].second; 4734 if( __kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, core_level) == core ) { 4735 KMP_CPU_SET( osID, mask); 4736 } 4737 } 4738 } 4739 } else if( nthreads <= ncores ) { 4740 4741 int core = 0; 4742 for( int i = 0; i < ncores; i++ ) { 4743 // Check if this core from procarr[] is in the mask 4744 int in_mask = 0; 4745 for( int j = 0; j < nth_per_core; j++ ) { 4746 if( procarr[ i * nth_per_core + j ] != - 1 ) { 4747 in_mask = 1; 4748 break; 4749 } 4750 } 4751 if( in_mask ) { 4752 if( tid == core ) { 4753 for( int j = 0; j < nth_per_core; j++ ) { 4754 int osID = procarr[ i * nth_per_core + j ]; 4755 if( osID != -1 ) { 4756 KMP_CPU_SET( osID, mask ); 4757 // For fine granularity it is enough to set the first available osID for this core 4758 if( fine_gran) { 4759 break; 4760 } 4761 } 4762 } 4763 break; 4764 } else { 4765 core++; 4766 } 4767 } 4768 } 4769 4770 } else { // nthreads > ncores 4771 4772 // Array to save the number of processors at each core 4773 int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores); 4774 // Array to save the number of cores with "x" available processors; 4775 int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1)); 4776 // Array to save the number of cores with # procs from x to nth_per_core 4777 int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1)); 4778 4779 for( int i = 0; i <= nth_per_core; i++ ) { 4780 ncores_with_x_procs[ i ] = 0; 4781 ncores_with_x_to_max_procs[ i ] = 0; 4782 } 4783 4784 for( int i = 0; i < ncores; i++ ) { 4785 int cnt = 0; 4786 for( int j = 0; j < nth_per_core; j++ ) { 4787 if( procarr[ i * nth_per_core + j ] != -1 ) { 4788 cnt++; 4789 } 4790 } 4791 nproc_at_core[ i ] = cnt; 4792 ncores_with_x_procs[ cnt ]++; 4793 } 4794 4795 for( int i = 0; i <= nth_per_core; i++ ) { 4796 for( int j = i; j <= nth_per_core; j++ ) { 4797 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ]; 4798 } 4799 } 4800 4801 // Max number of processors 4802 int nproc = nth_per_core * ncores; 4803 // An array to keep number of threads per each context 4804 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 4805 for( int i = 0; i < nproc; i++ ) { 4806 newarr[ i ] = 0; 4807 } 4808 4809 int nth = nthreads; 4810 int flag = 0; 4811 while( nth > 0 ) { 4812 for( int j = 1; j <= nth_per_core; j++ ) { 4813 int cnt = ncores_with_x_to_max_procs[ j ]; 4814 for( int i = 0; i < ncores; i++ ) { 4815 // Skip the core with 0 processors 4816 if( nproc_at_core[ i ] == 0 ) { 4817 continue; 4818 } 4819 for( int k = 0; k < nth_per_core; k++ ) { 4820 if( procarr[ i * nth_per_core + k ] != -1 ) { 4821 if( newarr[ i * nth_per_core + k ] == 0 ) { 4822 newarr[ i * nth_per_core + k ] = 1; 4823 cnt--; 4824 nth--; 4825 break; 4826 } else { 4827 if( flag != 0 ) { 4828 newarr[ i * nth_per_core + k ] ++; 4829 cnt--; 4830 nth--; 4831 break; 4832 } 4833 } 4834 } 4835 } 4836 if( cnt == 0 || nth == 0 ) { 4837 break; 4838 } 4839 } 4840 if( nth == 0 ) { 4841 break; 4842 } 4843 } 4844 flag = 1; 4845 
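// Net effect of the sweep above: while flag == 0 every assignment goes to a
// context that does not yet hold a thread, so each hardware context receives
// at most one thread and cores with more contexts absorb proportionally more
// of them; only when nthreads exceeds the number of usable contexts does
// flag flip to 1 and later sweeps start stacking extra threads onto
// already-occupied contexts.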
} 4846 int sum = 0; 4847 for( int i = 0; i < nproc; i++ ) { 4848 sum += newarr[ i ]; 4849 if( sum > tid ) { 4850 if( fine_gran) { 4851 int osID = procarr[ i ]; 4852 KMP_CPU_SET( osID, mask); 4853 } else { 4854 int coreID = i / nth_per_core; 4855 for( int ii = 0; ii < nth_per_core; ii++ ) { 4856 int osID = procarr[ coreID * nth_per_core + ii ]; 4857 if( osID != -1 ) { 4858 KMP_CPU_SET( osID, mask); 4859 } 4860 } 4861 } 4862 break; 4863 } 4864 } 4865 __kmp_free( newarr ); 4866 } 4867 4868 if (__kmp_affinity_verbose) { 4869 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4870 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4871 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4872 tid, buf); 4873 } 4874 __kmp_set_system_affinity( mask, TRUE ); 4875 KMP_CPU_FREE_FROM_STACK(mask); 4876 } 4877 } 4878 4879 #if KMP_OS_LINUX 4880 // We don't need this entry for Windows because 4881 // there is GetProcessAffinityMask() api 4882 // 4883 // The intended usage is indicated by these steps: 4884 // 1) The user gets the current affinity mask 4885 // 2) Then sets the affinity by calling this function 4886 // 3) Error check the return value 4887 // 4) Use non-OpenMP parallelization 4888 // 5) Reset the affinity to what was stored in step 1) 4889 #ifdef __cplusplus 4890 extern "C" 4891 #endif 4892 int 4893 kmp_set_thread_affinity_mask_initial() 4894 // the function returns 0 on success, 4895 // -1 if we cannot bind thread 4896 // >0 (errno) if an error happened during binding 4897 { 4898 int gtid = __kmp_get_gtid(); 4899 if (gtid < 0) { 4900 // Do not touch non-omp threads 4901 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 4902 "non-omp thread, returning\n")); 4903 return -1; 4904 } 4905 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 4906 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 4907 "affinity not initialized, returning\n")); 4908 return -1; 4909 } 4910 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 4911 "set full mask for thread %d\n", gtid)); 4912 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 4913 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 4914 } 4915 #endif 4916 4917 #endif // KMP_AFFINITY_SUPPORTED 4918
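// For illustration, the calling sequence documented above for
// kmp_set_thread_affinity_mask_initial() might look like this in user code
// (hypothetical helper; error handling omitted):
//
//   cpu_set_t saved;
//   sched_getaffinity(0, sizeof(saved), &saved);        // 1) save current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0) {  // 2)/3) widen to full mask
//       run_non_openmp_parallel_region();               // 4) hypothetical work
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);        // 5) restore saved mask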