/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_affinity.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() {
    machine_hierarchy.fini();
}

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void* KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void* KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void* p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void* p) { __kmp_free(p); }
void* KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void* p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
    KMPAffinity* affinity_dispatch;
    if (picked_api)
        return;
#if KMP_USE_HWLOC
    if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
        affinity_dispatch = new KMPHwlocAffinity();
    } else
#endif
    {
        affinity_dispatch = new KMPNativeAffinity();
    }
    __kmp_affinity_dispatch = affinity_dispatch;
    picked_api = true;
}

void KMPAffinity::destroy_api() {
    if (__kmp_affinity_dispatch != NULL) {
        delete __kmp_affinity_dispatch;
        __kmp_affinity_dispatch = NULL;
        picked_api = false;
    }
}

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    i = mask->begin();
    if (i == mask->end()) {
        KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i != mask->end(); i = mask->next(i)) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow. A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
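        // (Illustration only, not part of the original routine: a mask
        //  containing procs 0, 1, 2 and 5 is printed as "{0,1,2,5}"; if the
        //  space budget below runs out before the set is exhausted, the
        //  output is cut short and ends with ",...}".)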
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i != mask->end()) {
        KMP_SNPRINTF(scan, end-scan+1, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, end-scan+1, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object. This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level. Example: suppose the machine has 2 nodes
// with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers. By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
    __kmp_free(lastLabel);
    __kmp_free(counts);
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *__kmp_affin_fullMask
// to the affinity mask for the initialization thread. They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}

#if KMP_USE_HWLOC

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core; in that case we don't want the extra
// thread context level if it offers no unique labels, so it is removed.
// return value: the new depth of address2os
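// (Illustrative sketch of the effect, with made-up numbers: given depth 3 =
//  {package, core, thread} and exactly one thread context per core, every
//  thread label is 0, so the thread level is radix 1; it is removed and the
//  returned depth is 2.)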
static int
__kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os, int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel) {
    int level;
    int i;
    int radix1_detected;

    for (level = depth-1; level >= 0; --level) {
        // Always keep the package level
        if (level == *pkgLevel)
            continue;
        // Detect if this level is radix 1
        radix1_detected = 1;
        for (i = 1; i < nActiveThreads; ++i) {
            if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
                // There are differing label values for this level so it stays
                radix1_detected = 0;
                break;
            }
        }
        if (!radix1_detected)
            continue;
        // Radix 1 was detected
        if (level == *threadLevel) {
            // If only one thread per core, then just decrement
            // the depth which removes the threadlevel from address2os
            for (i = 0; i < nActiveThreads; ++i) {
                address2os[i].first.depth--;
            }
            *threadLevel = -1;
        } else if (level == *coreLevel) {
            // For core level, we move the thread labels over if they are still
            // valid (*threadLevel != -1), and also reduce the depth another level
            for (i = 0; i < nActiveThreads; ++i) {
                if (*threadLevel != -1) {
                    address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
                }
                address2os[i].first.depth--;
            }
            *coreLevel = -1;
        }
    }
    return address2os[0].first.depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology tree structure.
// e.g., if obj is a HWLOC_OBJ_SOCKET object, and type is HWLOC_OBJ_PU, then
// this will return the number of PU's under the SOCKET object.
static int
__kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
    int retval = 0;
    hwloc_obj_t first;
    for(first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
        first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
        first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
    {
        ++retval;
    }
    return retval;
}

static int
__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    int depth = 3;
    int pkgLevel = 0;
    int coreLevel = 1;
    int threadLevel = 2;

    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
        __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //

    hwloc_obj_t pu;
    hwloc_obj_t core;
    hwloc_obj_t socket;
    int nActiveThreads = 0;
    int socket_identifier = 0;
    // re-calculate globals to count only accessible resources
    __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
    for(socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
        socket != NULL;
        socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
        socket_identifier++)
    {
        int core_identifier = 0;
        int num_active_cores = 0;
        for(core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
            core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
            core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
            core_identifier++)
        {
            int pu_identifier = 0;
            int num_active_threads = 0;
            for(pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
                pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
                pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
                pu_identifier++)
            {
                Address addr(3);
                if(! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
                    continue; // skip inactive (inaccessible) unit
                KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                  socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index, pu->logical_index));
                addr.labels[0] = socket_identifier; // package
                addr.labels[1] = core_identifier;   // core
                addr.labels[2] = pu_identifier;     // pu
                retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
                __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
                nActiveThreads++;
                ++num_active_threads; // count active threads per core
            }
            if (num_active_threads) { // were there any active threads on the core?
                ++__kmp_ncores; // count total active cores
                ++num_active_cores; // count active cores per socket
                if (num_active_threads > __kmp_nThreadsPerCore)
                    __kmp_nThreadsPerCore = num_active_threads; // calc maximum
            }
        }
        if (num_active_cores) { // were there any active cores on the socket?
            ++nPackages; // count total active packages
            if (num_active_cores > nCoresPerPkg)
                nCoresPerPkg = num_active_cores; // calc maximum
        }
    }

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
    KMP_ASSERT(nActiveThreads > 0);
    if (nActiveThreads == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Check to see if the machine topology is uniform
    //
    unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", nPackages);
        //for (level = 1; level <= pkgLevel; level++) {
        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        //}
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}
#endif // KMP_USE_HWLOC

//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    if (__kmp_affinity_type == affinity_none) {
        int avail_ct = 0;
        int i;
        KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
            if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
                continue;
            __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
        }
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
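    // (Illustration only: with four available OS procs 0-3 the flat map built
    //  below is simply {0}->0, {1}->1, {2}->2, {3}->3, i.e. one package per
    //  proc and a topology depth of 1.)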
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
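    // (Illustration only: with 64 bits per DWORD_PTR, OS proc 70 is assigned
    //  labels {1, 6} by the loop below, i.e. processor group 1, bit 6, since
    //  70 = 64 + 6.)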
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
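// (For illustration only, with example numbers not taken from this file: if
//  cpuid(1) reports maxThreadsPerPkg = 16 and cpuid(4) reports
//  maxCoresPerPkg = 8, the mask widths computed below are widthCT = 4 and
//  widthC = 3, so an Apic Id of 0x53 decomposes as
//      pkgId    = 0x53 >> 4       = 5
//      coreId   = (0x53 >> 1) & 7 = 1
//      threadId = 0x53 & 1        = 1.)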
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off. We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
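        // (Illustration with made-up numbers: __kmp_xproc = 8 and
        //  nCoresPerPkg = 4 give nPackages = (8 + 4 - 1) / 4 = 2 below, with
        //  one thread per core assumed.)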
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id. It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact. In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has
    //    been disabled, the value of this field will be 2 (for a single core
    //    chip). On other OS/chip combinations supporting
    //    Intel(R) Hyper-Threading Technology, the value of
    //    this field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id. The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_dispatch->bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
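        // (For reference, e.g.: ebx = 0x00100800 from cpuid(1) would mean an
        //  initial APIC id of 0x00 in bits 24:31 and 16 logical processors
        //  per package in bits 16:23.)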
        __kmp_x86_cpuid(1, 0, &buf);
        if (((buf.edx >> 9) & 1) == 0) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields. pkgId's may be sparsely
    // assigned among the chips on a system. Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now. We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1; // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks. Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

    }
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    for (i = 0; i < nApics; ++i) {
        __kmp_pu_os_idx[i] = threadInfo[i].osId;
    }
    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology. While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will
    // try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xffff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xffff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xffff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest. The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
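    // (Illustration only, hypothetical machine: if leaf 11 reported shift 1
    //  at the SMT level and shift 5 at the core level, the extraction loop
    //  below would assign labels {apicId >> 5, (apicId & 0x1f) >> 1,
    //  apicId & 1}, i.e. package, core within package, thread within core.)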
    AddrUnsPair *retval = (AddrUnsPair *)
        __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_dispatch->bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes incorrect topology information to
                    // be printed in case the max value for some level
                    // (maxCt[level]) is encountered earlier than a smaller
                    // value while going through the array.
                    // For example, suppose pkg0 has 4 cores and pkg1 has
                    // 2 cores. Then maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages. Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    for (proc = 0; (int)proc < nApics; ++proc) {
        __kmp_pu_os_idx[proc] = retval[proc].second;
    }
    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        int newPkgLevel = -1;
        int newCoreLevel = -1;
        int newThreadLevel = -1;
        int i;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1)
              && (level != pkgLevel)) {
                //
                // Remove this level. Never remove the package level
                //
                continue;
            }
            if (level == pkgLevel) {
                newPkgLevel = level;
            }
            if (level == coreLevel) {
                newCoreLevel = level;
            }
            if (level == threadLevel) {
                newThreadLevel = level;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
        pkgLevel = newPkgLevel;
        coreLevel = newCoreLevel;
        threadLevel = newThreadLevel;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
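        // (Illustration, assuming the usual granularity ordering
        //  fine < thread < core < package: with a depth-3 map and
        //  KMP_AFFINITY granularity=core, only the thread level lies below
        //  the granularity, so __kmp_affinity_gran_levels ends up as 1.)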
1833 // 1834 __kmp_affinity_gran_levels = 0; 1835 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1836 __kmp_affinity_gran_levels++; 1837 } 1838 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1839 __kmp_affinity_gran_levels++; 1840 } 1841 if (__kmp_affinity_gran > affinity_gran_package) { 1842 __kmp_affinity_gran_levels++; 1843 } 1844 } 1845 1846 if (__kmp_affinity_verbose) { 1847 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1848 coreLevel, threadLevel); 1849 } 1850 1851 __kmp_free(last); 1852 __kmp_free(maxCt); 1853 __kmp_free(counts); 1854 __kmp_free(totals); 1855 KMP_CPU_FREE(oldMask); 1856 *address2os = retval; 1857 return depth; 1858 } 1859 1860 1861 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1862 1863 1864 #define osIdIndex 0 1865 #define threadIdIndex 1 1866 #define coreIdIndex 2 1867 #define pkgIdIndex 3 1868 #define nodeIdIndex 4 1869 1870 typedef unsigned *ProcCpuInfo; 1871 static unsigned maxIndex = pkgIdIndex; 1872 1873 1874 static int 1875 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1876 { 1877 const unsigned *aa = (const unsigned *)a; 1878 const unsigned *bb = (const unsigned *)b; 1879 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1880 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1881 return 0; 1882 }; 1883 1884 1885 static int 1886 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1887 { 1888 unsigned i; 1889 const unsigned *aa = *((const unsigned **)a); 1890 const unsigned *bb = *((const unsigned **)b); 1891 for (i = maxIndex; ; i--) { 1892 if (aa[i] < bb[i]) return -1; 1893 if (aa[i] > bb[i]) return 1; 1894 if (i == osIdIndex) break; 1895 } 1896 return 0; 1897 } 1898 1899 1900 // 1901 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1902 // affinity map. 1903 // 1904 static int 1905 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1906 kmp_i18n_id_t *const msg_id, FILE *f) 1907 { 1908 *address2os = NULL; 1909 *msg_id = kmp_i18n_null; 1910 1911 // 1912 // Scan of the file, and count the number of "processor" (osId) fields, 1913 // and find the highest value of <n> for a node_<n> field. 1914 // 1915 char buf[256]; 1916 unsigned num_records = 0; 1917 while (! feof(f)) { 1918 buf[sizeof(buf) - 1] = 1; 1919 if (! fgets(buf, sizeof(buf), f)) { 1920 // 1921 // Read errors presumably because of EOF 1922 // 1923 break; 1924 } 1925 1926 char s1[] = "processor"; 1927 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1928 num_records++; 1929 continue; 1930 } 1931 1932 // 1933 // FIXME - this will match "node_<n> <garbage>" 1934 // 1935 unsigned level; 1936 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1937 if (nodeIdIndex + level >= maxIndex) { 1938 maxIndex = nodeIdIndex + level; 1939 } 1940 continue; 1941 } 1942 } 1943 1944 // 1945 // Check for empty file / no valid processor records, or too many. 1946 // The number of records can't exceed the number of valid bits in the 1947 // affinity mask. 1948 // 1949 if (num_records == 0) { 1950 *line = 0; 1951 *msg_id = kmp_i18n_str_NoProcRecords; 1952 return -1; 1953 } 1954 if (num_records > (unsigned)__kmp_xproc) { 1955 *line = 0; 1956 *msg_id = kmp_i18n_str_TooManyProcRecords; 1957 return -1; 1958 } 1959 1960 // 1961 // Set the file pointer back to the begginning, so that we can scan the 1962 // file again, this time performing a full parse of the data. 1963 // Allocate a vector of ProcCpuInfo object, where we will place the data. 
1964 // Adding an extra element at the end allows us to remove a lot of extra 1965 // checks for termination conditions. 1966 // 1967 if (fseek(f, 0, SEEK_SET) != 0) { 1968 *line = 0; 1969 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1970 return -1; 1971 } 1972 1973 // 1974 // Allocate the array of records to store the proc info in. The dummy 1975 // element at the end makes the logic in filling them out easier to code. 1976 // 1977 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1978 * sizeof(unsigned *)); 1979 unsigned i; 1980 for (i = 0; i <= num_records; i++) { 1981 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1982 * sizeof(unsigned)); 1983 } 1984 1985 #define CLEANUP_THREAD_INFO \ 1986 for (i = 0; i <= num_records; i++) { \ 1987 __kmp_free(threadInfo[i]); \ 1988 } \ 1989 __kmp_free(threadInfo); 1990 1991 // 1992 // A value of UINT_MAX means that we didn't find the field 1993 // 1994 unsigned __index; 1995 1996 #define INIT_PROC_INFO(p) \ 1997 for (__index = 0; __index <= maxIndex; __index++) { \ 1998 (p)[__index] = UINT_MAX; \ 1999 } 2000 2001 for (i = 0; i <= num_records; i++) { 2002 INIT_PROC_INFO(threadInfo[i]); 2003 } 2004 2005 unsigned num_avail = 0; 2006 *line = 0; 2007 while (! feof(f)) { 2008 // 2009 // Create an inner scoping level, so that all the goto targets at the 2010 // end of the loop appear in an outer scoping level. This avoids 2011 // warnings about jumping past an initialization to a target in the 2012 // same block. 2013 // 2014 { 2015 buf[sizeof(buf) - 1] = 1; 2016 bool long_line = false; 2017 if (! fgets(buf, sizeof(buf), f)) { 2018 // 2019 // Read errors presumably because of EOF 2020 // 2021 // If there is valid data in threadInfo[num_avail], then fake 2022 // a blank line in ensure that the last address gets parsed. 2023 // 2024 bool valid = false; 2025 for (i = 0; i <= maxIndex; i++) { 2026 if (threadInfo[num_avail][i] != UINT_MAX) { 2027 valid = true; 2028 } 2029 } 2030 if (! valid) { 2031 break; 2032 } 2033 buf[0] = 0; 2034 } else if (!buf[sizeof(buf) - 1]) { 2035 // 2036 // The line is longer than the buffer. Set a flag and don't 2037 // emit an error if we were going to ignore the line, anyway. 
2038 // 2039 long_line = true; 2040 2041 #define CHECK_LINE \ 2042 if (long_line) { \ 2043 CLEANUP_THREAD_INFO; \ 2044 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2045 return -1; \ 2046 } 2047 } 2048 (*line)++; 2049 2050 char s1[] = "processor"; 2051 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2052 CHECK_LINE; 2053 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2054 unsigned val; 2055 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2056 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 2057 threadInfo[num_avail][osIdIndex] = val; 2058 #if KMP_OS_LINUX && USE_SYSFS_INFO 2059 char path[256]; 2060 KMP_SNPRINTF(path, sizeof(path), 2061 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2062 threadInfo[num_avail][osIdIndex]); 2063 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2064 2065 KMP_SNPRINTF(path, sizeof(path), 2066 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2067 threadInfo[num_avail][osIdIndex]); 2068 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2069 continue; 2070 #else 2071 } 2072 char s2[] = "physical id"; 2073 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2074 CHECK_LINE; 2075 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2076 unsigned val; 2077 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2078 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2079 threadInfo[num_avail][pkgIdIndex] = val; 2080 continue; 2081 } 2082 char s3[] = "core id"; 2083 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2084 CHECK_LINE; 2085 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2086 unsigned val; 2087 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2088 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2089 threadInfo[num_avail][coreIdIndex] = val; 2090 continue; 2091 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2092 } 2093 char s4[] = "thread id"; 2094 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2095 CHECK_LINE; 2096 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2097 unsigned val; 2098 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2099 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2100 threadInfo[num_avail][threadIdIndex] = val; 2101 continue; 2102 } 2103 unsigned level; 2104 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2105 CHECK_LINE; 2106 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2107 unsigned val; 2108 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2109 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2110 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2111 threadInfo[num_avail][nodeIdIndex + level] = val; 2112 continue; 2113 } 2114 2115 // 2116 // We didn't recognize the leading token on the line. 2117 // There are lots of leading tokens that we don't recognize - 2118 // if the line isn't empty, go on to the next line. 2119 // 2120 if ((*buf != 0) && (*buf != '\n')) { 2121 // 2122 // If the line is longer than the buffer, read characters 2123 // until we find a newline. 2124 // 2125 if (long_line) { 2126 int ch; 2127 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2128 } 2129 continue; 2130 } 2131 2132 // 2133 // A newline has signalled the end of the processor record. 2134 // Check that there aren't too many procs specified. 2135 // 2136 if ((int)num_avail == __kmp_xproc) { 2137 CLEANUP_THREAD_INFO; 2138 *msg_id = kmp_i18n_str_TooManyEntries; 2139 return -1; 2140 } 2141 2142 // 2143 // Check for missing fields. 
The osId field must be there, and we 2144 // currently require that the physical id field is specified, also. 2145 // 2146 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2147 CLEANUP_THREAD_INFO; 2148 *msg_id = kmp_i18n_str_MissingProcField; 2149 return -1; 2150 } 2151 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2152 CLEANUP_THREAD_INFO; 2153 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2154 return -1; 2155 } 2156 2157 // 2158 // Skip this proc if it is not included in the machine model. 2159 // 2160 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) { 2161 INIT_PROC_INFO(threadInfo[num_avail]); 2162 continue; 2163 } 2164 2165 // 2166 // We have a successful parse of this proc's info. 2167 // Increment the counter, and prepare for the next proc. 2168 // 2169 num_avail++; 2170 KMP_ASSERT(num_avail <= num_records); 2171 INIT_PROC_INFO(threadInfo[num_avail]); 2172 } 2173 continue; 2174 2175 no_val: 2176 CLEANUP_THREAD_INFO; 2177 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2178 return -1; 2179 2180 dup_field: 2181 CLEANUP_THREAD_INFO; 2182 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2183 return -1; 2184 } 2185 *line = 0; 2186 2187 # if KMP_MIC && REDUCE_TEAM_SIZE 2188 unsigned teamSize = 0; 2189 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2190 2191 // check for num_records == __kmp_xproc ??? 2192 2193 // 2194 // If there's only one thread context to bind to, form an Address object 2195 // with depth 1 and return immediately (or, if affinity is off, set 2196 // address2os to NULL and return). 2197 // 2198 // If it is configured to omit the package level when there is only a 2199 // single package, the logic at the end of this routine won't work if 2200 // there is only a single thread - it would try to form an Address 2201 // object with depth 0. 2202 // 2203 KMP_ASSERT(num_avail > 0); 2204 KMP_ASSERT(num_avail <= num_records); 2205 if (num_avail == 1) { 2206 __kmp_ncores = 1; 2207 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2208 if (__kmp_affinity_verbose) { 2209 if (! 
KMP_AFFINITY_CAPABLE()) { 2210 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2211 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2212 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2213 } 2214 else { 2215 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2216 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2217 __kmp_affin_fullMask); 2218 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2219 if (__kmp_affinity_respect_mask) { 2220 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2221 } else { 2222 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2223 } 2224 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2225 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2226 } 2227 int index; 2228 kmp_str_buf_t buf; 2229 __kmp_str_buf_init(&buf); 2230 __kmp_str_buf_print(&buf, "1"); 2231 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2232 __kmp_str_buf_print(&buf, " x 1"); 2233 } 2234 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2235 __kmp_str_buf_free(&buf); 2236 } 2237 2238 if (__kmp_affinity_type == affinity_none) { 2239 CLEANUP_THREAD_INFO; 2240 return 0; 2241 } 2242 2243 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2244 Address addr(1); 2245 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2246 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2247 2248 if (__kmp_affinity_gran_levels < 0) { 2249 __kmp_affinity_gran_levels = 0; 2250 } 2251 2252 if (__kmp_affinity_verbose) { 2253 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2254 } 2255 2256 CLEANUP_THREAD_INFO; 2257 return 1; 2258 } 2259 2260 // 2261 // Sort the threadInfo table by physical Id. 2262 // 2263 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2264 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2265 2266 // 2267 // The table is now sorted by pkgId / coreId / threadId, but we really 2268 // don't know the radix of any of the fields. pkgId's may be sparsely 2269 // assigned among the chips on a system. Although coreId's are usually 2270 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2271 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2272 // 2273 // For that matter, we don't know what coresPerPkg and threadsPerCore 2274 // (or the total # packages) are at this point - we want to determine 2275 // that now. We only have an upper bound on the first two figures. 2276 // 2277 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2278 * sizeof(unsigned)); 2279 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2280 * sizeof(unsigned)); 2281 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2282 * sizeof(unsigned)); 2283 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2284 * sizeof(unsigned)); 2285 2286 bool assign_thread_ids = false; 2287 unsigned threadIdCt; 2288 unsigned index; 2289 2290 restart_radix_check: 2291 threadIdCt = 0; 2292 2293 // 2294 // Initialize the counter arrays with data from threadInfo[0]. 2295 // 2296 if (assign_thread_ids) { 2297 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2298 threadInfo[0][threadIdIndex] = threadIdCt++; 2299 } 2300 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2301 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2302 } 2303 } 2304 for (index = 0; index <= maxIndex; index++) { 2305 counts[index] = 1; 2306 maxCt[index] = 1; 2307 totals[index] = 1; 2308 lastId[index] = threadInfo[0][index];; 2309 } 2310 2311 // 2312 // Run through the rest of the OS procs. 
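// Illustrative example (not from the original sources): for eight records with
// (pkgId, coreId, threadId) =
//   (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1) (1,1,0) (1,1,1)
// the pass below ends with totals[pkgIdIndex] == 2, maxCt[coreIdIndex] == 2 and
// maxCt[threadIdIndex] == 2, i.e. 2 packages x 2 cores x 2 threads, and the
// uniformity check further down succeeds.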
2313 // 2314 for (i = 1; i < num_avail; i++) { 2315 // 2316 // Find the most significant index whose id differs 2317 // from the id for the previous OS proc. 2318 // 2319 for (index = maxIndex; index >= threadIdIndex; index--) { 2320 if (assign_thread_ids && (index == threadIdIndex)) { 2321 // 2322 // Auto-assign the thread id field if it wasn't specified. 2323 // 2324 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2325 threadInfo[i][threadIdIndex] = threadIdCt++; 2326 } 2327 2328 // 2329 // Apparently the thread id field was specified for some 2330 // entries and not others. Start the thread id counter 2331 // off at the next higher thread id. 2332 // 2333 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2334 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2335 } 2336 } 2337 if (threadInfo[i][index] != lastId[index]) { 2338 // 2339 // Run through all indices which are less significant, 2340 // and reset the counts to 1. 2341 // 2342 // At all levels up to and including index, we need to 2343 // increment the totals and record the last id. 2344 // 2345 unsigned index2; 2346 for (index2 = threadIdIndex; index2 < index; index2++) { 2347 totals[index2]++; 2348 if (counts[index2] > maxCt[index2]) { 2349 maxCt[index2] = counts[index2]; 2350 } 2351 counts[index2] = 1; 2352 lastId[index2] = threadInfo[i][index2]; 2353 } 2354 counts[index]++; 2355 totals[index]++; 2356 lastId[index] = threadInfo[i][index]; 2357 2358 if (assign_thread_ids && (index > threadIdIndex)) { 2359 2360 # if KMP_MIC && REDUCE_TEAM_SIZE 2361 // 2362 // The default team size is the total #threads in the machine 2363 // minus 1 thread for every core that has 3 or more threads. 2364 // 2365 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2366 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2367 2368 // 2369 // Restart the thread counter, as we are on a new core. 2370 // 2371 threadIdCt = 0; 2372 2373 // 2374 // Auto-assign the thread id field if it wasn't specified. 2375 // 2376 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2377 threadInfo[i][threadIdIndex] = threadIdCt++; 2378 } 2379 2380 // 2381 // Apparently the thread id field was specified for some 2382 // entries and not others. Start the thread id counter 2383 // off at the next higher thread id. 2384 // 2385 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2386 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2387 } 2388 } 2389 break; 2390 } 2391 } 2392 if (index < threadIdIndex) { 2393 // 2394 // If thread ids were specified, it is an error if they are not 2395 // unique. Also, check that we haven't already restarted the 2396 // loop (to be safe - shouldn't need to). 2397 // 2398 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2399 || assign_thread_ids) { 2400 __kmp_free(lastId); 2401 __kmp_free(totals); 2402 __kmp_free(maxCt); 2403 __kmp_free(counts); 2404 CLEANUP_THREAD_INFO; 2405 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2406 return -1; 2407 } 2408 2409 // 2410 // If the thread ids were not specified and we see entries 2411 // that are duplicates, start the loop over and 2412 // assign the thread ids manually. 2413 // 2414 assign_thread_ids = true; 2415 goto restart_radix_check; 2416 } 2417 } 2418 2419 # if KMP_MIC && REDUCE_TEAM_SIZE 2420 // 2421 // The default team size is the total #threads in the machine 2422 // minus 1 thread for every core that has 3 or more threads. 2423 // 2424 teamSize += ( threadIdCt <= 2 ) ? 
( threadIdCt ) : ( threadIdCt - 1 ); 2425 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2426 2427 for (index = threadIdIndex; index <= maxIndex; index++) { 2428 if (counts[index] > maxCt[index]) { 2429 maxCt[index] = counts[index]; 2430 } 2431 } 2432 2433 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2434 nCoresPerPkg = maxCt[coreIdIndex]; 2435 nPackages = totals[pkgIdIndex]; 2436 2437 // 2438 // Check to see if the machine topology is uniform 2439 // 2440 unsigned prod = totals[maxIndex]; 2441 for (index = threadIdIndex; index < maxIndex; index++) { 2442 prod *= maxCt[index]; 2443 } 2444 bool uniform = (prod == totals[threadIdIndex]); 2445 2446 // 2447 // When affinity is off, this routine will still be called to set 2448 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2449 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2450 // correctly, and return now if affinity is not enabled. 2451 // 2452 __kmp_ncores = totals[coreIdIndex]; 2453 2454 if (__kmp_affinity_verbose) { 2455 if (! KMP_AFFINITY_CAPABLE()) { 2456 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2457 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2458 if (uniform) { 2459 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2460 } else { 2461 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2462 } 2463 } 2464 else { 2465 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2466 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); 2467 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2468 if (__kmp_affinity_respect_mask) { 2469 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2470 } else { 2471 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2472 } 2473 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2474 if (uniform) { 2475 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2476 } else { 2477 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2478 } 2479 } 2480 kmp_str_buf_t buf; 2481 __kmp_str_buf_init(&buf); 2482 2483 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2484 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2485 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2486 } 2487 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2488 maxCt[threadIdIndex], __kmp_ncores); 2489 2490 __kmp_str_buf_free(&buf); 2491 } 2492 2493 # if KMP_MIC && REDUCE_TEAM_SIZE 2494 // 2495 // Set the default team size. 2496 // 2497 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2498 __kmp_dflt_team_nth = teamSize; 2499 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2500 __kmp_dflt_team_nth)); 2501 } 2502 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2503 2504 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2505 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2506 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2507 for (i = 0; i < num_avail; ++i) { // fill the os indices 2508 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2509 } 2510 2511 if (__kmp_affinity_type == affinity_none) { 2512 __kmp_free(lastId); 2513 __kmp_free(totals); 2514 __kmp_free(maxCt); 2515 __kmp_free(counts); 2516 CLEANUP_THREAD_INFO; 2517 return 0; 2518 } 2519 2520 // 2521 // Count the number of levels which have more nodes at that level than 2522 // at the parent's level (with there being an implicit root node of 2523 // the top level). This is equivalent to saying that there is at least 2524 // one node at this level which has a sibling. These levels are in the 2525 // map, and the package level is always in the map. 
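// A minimal sketch of that rule (hypothetical helper, for illustration only):
// given totals[] indexed from the finest level (threadIdIndex) up to maxIndex,
// a level is kept when it has more nodes than the next-coarser level, and the
// package level is always kept. E.g. one package with 4 single-threaded cores
// gives totals[threadIdIndex] == totals[coreIdIndex] == 4 and
// totals[pkgIdIndex] == 1, so the thread level is dropped and the depth is 2.
#if 0
static int count_map_levels(const unsigned *totals, unsigned threadIdIndex,
                            unsigned pkgIdIndex, unsigned maxIndex) {
  int depth = 0;
  for (unsigned index = threadIdIndex; index <= maxIndex; index++) {
    int keep;
    if (index == pkgIdIndex)
      keep = 1;                                    // package level always kept
    else if (index < maxIndex)
      keep = (totals[index] > totals[index + 1]);  // some node has a sibling
    else
      keep = (totals[index] > 1);                  // topmost (e.g. node) level
    if (keep)
      depth++;
  }
  return depth;
}
#endif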
2526 // 2527 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2528 int level = 0; 2529 for (index = threadIdIndex; index < maxIndex; index++) { 2530 KMP_ASSERT(totals[index] >= totals[index + 1]); 2531 inMap[index] = (totals[index] > totals[index + 1]); 2532 } 2533 inMap[maxIndex] = (totals[maxIndex] > 1); 2534 inMap[pkgIdIndex] = true; 2535 2536 int depth = 0; 2537 for (index = threadIdIndex; index <= maxIndex; index++) { 2538 if (inMap[index]) { 2539 depth++; 2540 } 2541 } 2542 KMP_ASSERT(depth > 0); 2543 2544 // 2545 // Construct the data structure that is to be returned. 2546 // 2547 *address2os = (AddrUnsPair*) 2548 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2549 int pkgLevel = -1; 2550 int coreLevel = -1; 2551 int threadLevel = -1; 2552 2553 for (i = 0; i < num_avail; ++i) { 2554 Address addr(depth); 2555 unsigned os = threadInfo[i][osIdIndex]; 2556 int src_index; 2557 int dst_index = 0; 2558 2559 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2560 if (! inMap[src_index]) { 2561 continue; 2562 } 2563 addr.labels[dst_index] = threadInfo[i][src_index]; 2564 if (src_index == pkgIdIndex) { 2565 pkgLevel = dst_index; 2566 } 2567 else if (src_index == coreIdIndex) { 2568 coreLevel = dst_index; 2569 } 2570 else if (src_index == threadIdIndex) { 2571 threadLevel = dst_index; 2572 } 2573 dst_index++; 2574 } 2575 (*address2os)[i] = AddrUnsPair(addr, os); 2576 } 2577 2578 if (__kmp_affinity_gran_levels < 0) { 2579 // 2580 // Set the granularity level based on what levels are modeled 2581 // in the machine topology map. 2582 // 2583 unsigned src_index; 2584 __kmp_affinity_gran_levels = 0; 2585 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2586 if (! inMap[src_index]) { 2587 continue; 2588 } 2589 switch (src_index) { 2590 case threadIdIndex: 2591 if (__kmp_affinity_gran > affinity_gran_thread) { 2592 __kmp_affinity_gran_levels++; 2593 } 2594 2595 break; 2596 case coreIdIndex: 2597 if (__kmp_affinity_gran > affinity_gran_core) { 2598 __kmp_affinity_gran_levels++; 2599 } 2600 break; 2601 2602 case pkgIdIndex: 2603 if (__kmp_affinity_gran > affinity_gran_package) { 2604 __kmp_affinity_gran_levels++; 2605 } 2606 break; 2607 } 2608 } 2609 } 2610 2611 if (__kmp_affinity_verbose) { 2612 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2613 coreLevel, threadLevel); 2614 } 2615 2616 __kmp_free(inMap); 2617 __kmp_free(lastId); 2618 __kmp_free(totals); 2619 __kmp_free(maxCt); 2620 __kmp_free(counts); 2621 CLEANUP_THREAD_INFO; 2622 return depth; 2623 } 2624 2625 2626 // 2627 // Create and return a table of affinity masks, indexed by OS thread ID. 2628 // This routine handles OR'ing together all the affinity masks of threads 2629 // that are sufficiently close, if granularity > fine. 2630 // 2631 static kmp_affin_mask_t * 2632 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2633 AddrUnsPair *address2os, unsigned numAddrs) 2634 { 2635 // 2636 // First form a table of affinity masks in order of OS thread id. 2637 // 2638 unsigned depth; 2639 unsigned maxOsId; 2640 unsigned i; 2641 2642 KMP_ASSERT(numAddrs > 0); 2643 depth = address2os[0].first.depth; 2644 2645 maxOsId = 0; 2646 for (i = 0; i < numAddrs; i++) { 2647 unsigned osId = address2os[i].second; 2648 if (osId > maxOsId) { 2649 maxOsId = osId; 2650 } 2651 } 2652 kmp_affin_mask_t *osId2Mask; 2653 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1)); 2654 2655 // 2656 // Sort the address2os table according to physical order. 
Doing so 2657 // will put all threads on the same core/package/node in consecutive 2658 // locations. 2659 // 2660 qsort(address2os, numAddrs, sizeof(*address2os), 2661 __kmp_affinity_cmp_Address_labels); 2662 2663 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2664 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2665 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2666 } 2667 if (__kmp_affinity_gran_levels >= (int)depth) { 2668 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2669 && (__kmp_affinity_type != affinity_none))) { 2670 KMP_WARNING(AffThreadsMayMigrate); 2671 } 2672 } 2673 2674 // 2675 // Run through the table, forming the masks for all threads on each 2676 // core. Threads on the same core will have identical "Address" 2677 // objects, not considering the last level, which must be the thread 2678 // id. All threads on a core will appear consecutively. 2679 // 2680 unsigned unique = 0; 2681 unsigned j = 0; // index of 1st thread on core 2682 unsigned leader = 0; 2683 Address *leaderAddr = &(address2os[0].first); 2684 kmp_affin_mask_t *sum; 2685 KMP_CPU_ALLOC_ON_STACK(sum); 2686 KMP_CPU_ZERO(sum); 2687 KMP_CPU_SET(address2os[0].second, sum); 2688 for (i = 1; i < numAddrs; i++) { 2689 // 2690 // If this thread is sufficiently close to the leader (within the 2691 // granularity setting), then set the bit for this os thread in the 2692 // affinity mask for this group, and go on to the next thread. 2693 // 2694 if (leaderAddr->isClose(address2os[i].first, 2695 __kmp_affinity_gran_levels)) { 2696 KMP_CPU_SET(address2os[i].second, sum); 2697 continue; 2698 } 2699 2700 // 2701 // For every thread in this group, copy the mask to the thread's 2702 // entry in the osId2Mask table. Mark the first address as a 2703 // leader. 2704 // 2705 for (; j < i; j++) { 2706 unsigned osId = address2os[j].second; 2707 KMP_DEBUG_ASSERT(osId <= maxOsId); 2708 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2709 KMP_CPU_COPY(mask, sum); 2710 address2os[j].first.leader = (j == leader); 2711 } 2712 unique++; 2713 2714 // 2715 // Start a new mask. 2716 // 2717 leader = i; 2718 leaderAddr = &(address2os[i].first); 2719 KMP_CPU_ZERO(sum); 2720 KMP_CPU_SET(address2os[i].second, sum); 2721 } 2722 2723 // 2724 // For every thread in last group, copy the mask to the thread's 2725 // entry in the osId2Mask table. 2726 // 2727 for (; j < i; j++) { 2728 unsigned osId = address2os[j].second; 2729 KMP_DEBUG_ASSERT(osId <= maxOsId); 2730 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2731 KMP_CPU_COPY(mask, sum); 2732 address2os[j].first.leader = (j == leader); 2733 } 2734 unique++; 2735 KMP_CPU_FREE_FROM_STACK(sum); 2736 2737 *maxIndex = maxOsId; 2738 *numUnique = unique; 2739 return osId2Mask; 2740 } 2741 2742 2743 // 2744 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2745 // as file-static than to try and pass them through the calling sequence of 2746 // the recursive-descent OMP_PLACES parser. 
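// For reference (illustrative examples, assuming all listed OS procs exist in
// the machine's full mask):
//   - an explicit proc list such as "3,0-2,{4,5},8-14:2" expands to the masks
//     {3} {0} {1} {2} {4,5} {8} {10} {12} {14}; a brace-enclosed set becomes a
//     single mask, while a range with an optional stride adds one mask per proc.
//   - an OMP_PLACES list such as "{0:2}:4:2" expands to the places
//     {0,1} {2,3} {4,5} {6,7}: the trailing ":count:stride" replicates the
//     initial place by shifting every OS proc id by the stride.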
2747 // 2748 static kmp_affin_mask_t *newMasks; 2749 static int numNewMasks; 2750 static int nextNewMask; 2751 2752 #define ADD_MASK(_mask) \ 2753 { \ 2754 if (nextNewMask >= numNewMasks) { \ 2755 int i; \ 2756 numNewMasks *= 2; \ 2757 kmp_affin_mask_t* temp; \ 2758 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2759 for(i=0;i<numNewMasks/2;i++) { \ 2760 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \ 2761 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \ 2762 KMP_CPU_COPY(dest, src); \ 2763 } \ 2764 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \ 2765 newMasks = temp; \ 2766 } \ 2767 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2768 nextNewMask++; \ 2769 } 2770 2771 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2772 { \ 2773 if (((_osId) > _maxOsId) || \ 2774 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2775 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2776 && (__kmp_affinity_type != affinity_none))) { \ 2777 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2778 } \ 2779 } \ 2780 else { \ 2781 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2782 } \ 2783 } 2784 2785 2786 // 2787 // Re-parse the proclist (for the explicit affinity type), and form the list 2788 // of affinity newMasks indexed by gtid. 2789 // 2790 static void 2791 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2792 unsigned int *out_numMasks, const char *proclist, 2793 kmp_affin_mask_t *osId2Mask, int maxOsId) 2794 { 2795 int i; 2796 const char *scan = proclist; 2797 const char *next = proclist; 2798 2799 // 2800 // We use malloc() for the temporary mask vector, 2801 // so that we can use realloc() to extend it. 2802 // 2803 numNewMasks = 2; 2804 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2805 nextNewMask = 0; 2806 kmp_affin_mask_t *sumMask; 2807 KMP_CPU_ALLOC(sumMask); 2808 int setSize = 0; 2809 2810 for (;;) { 2811 int start, end, stride; 2812 2813 SKIP_WS(scan); 2814 next = scan; 2815 if (*next == '\0') { 2816 break; 2817 } 2818 2819 if (*next == '{') { 2820 int num; 2821 setSize = 0; 2822 next++; // skip '{' 2823 SKIP_WS(next); 2824 scan = next; 2825 2826 // 2827 // Read the first integer in the set. 2828 // 2829 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2830 "bad proclist"); 2831 SKIP_DIGITS(next); 2832 num = __kmp_str_to_int(scan, *next); 2833 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2834 2835 // 2836 // Copy the mask for that osId to the sum (union) mask. 2837 // 2838 if ((num > maxOsId) || 2839 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2840 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2841 && (__kmp_affinity_type != affinity_none))) { 2842 KMP_WARNING(AffIgnoreInvalidProcID, num); 2843 } 2844 KMP_CPU_ZERO(sumMask); 2845 } 2846 else { 2847 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2848 setSize = 1; 2849 } 2850 2851 for (;;) { 2852 // 2853 // Check for end of set. 2854 // 2855 SKIP_WS(next); 2856 if (*next == '}') { 2857 next++; // skip '}' 2858 break; 2859 } 2860 2861 // 2862 // Skip optional comma. 2863 // 2864 if (*next == ',') { 2865 next++; 2866 } 2867 SKIP_WS(next); 2868 2869 // 2870 // Read the next integer in the set. 2871 // 2872 scan = next; 2873 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2874 "bad explicit proc list"); 2875 2876 SKIP_DIGITS(next); 2877 num = __kmp_str_to_int(scan, *next); 2878 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2879 2880 // 2881 // Add the mask for that osId to the sum mask. 2882 // 2883 if ((num > maxOsId) || 2884 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2885 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2886 && (__kmp_affinity_type != affinity_none))) { 2887 KMP_WARNING(AffIgnoreInvalidProcID, num); 2888 } 2889 } 2890 else { 2891 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2892 setSize++; 2893 } 2894 } 2895 if (setSize > 0) { 2896 ADD_MASK(sumMask); 2897 } 2898 2899 SKIP_WS(next); 2900 if (*next == ',') { 2901 next++; 2902 } 2903 scan = next; 2904 continue; 2905 } 2906 2907 // 2908 // Read the first integer. 2909 // 2910 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2911 SKIP_DIGITS(next); 2912 start = __kmp_str_to_int(scan, *next); 2913 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2914 SKIP_WS(next); 2915 2916 // 2917 // If this isn't a range, then add a mask to the list and go on. 2918 // 2919 if (*next != '-') { 2920 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2921 2922 // 2923 // Skip optional comma. 2924 // 2925 if (*next == ',') { 2926 next++; 2927 } 2928 scan = next; 2929 continue; 2930 } 2931 2932 // 2933 // This is a range. Skip over the '-' and read in the 2nd int. 2934 // 2935 next++; // skip '-' 2936 SKIP_WS(next); 2937 scan = next; 2938 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2939 SKIP_DIGITS(next); 2940 end = __kmp_str_to_int(scan, *next); 2941 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2942 2943 // 2944 // Check for a stride parameter 2945 // 2946 stride = 1; 2947 SKIP_WS(next); 2948 if (*next == ':') { 2949 // 2950 // A stride is specified. Skip over the ':" and read the 3rd int. 2951 // 2952 int sign = +1; 2953 next++; // skip ':' 2954 SKIP_WS(next); 2955 scan = next; 2956 if (*next == '-') { 2957 sign = -1; 2958 next++; 2959 SKIP_WS(next); 2960 scan = next; 2961 } 2962 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2963 "bad explicit proc list"); 2964 SKIP_DIGITS(next); 2965 stride = __kmp_str_to_int(scan, *next); 2966 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2967 stride *= sign; 2968 } 2969 2970 // 2971 // Do some range checks. 2972 // 2973 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2974 if (stride > 0) { 2975 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2976 } 2977 else { 2978 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2979 } 2980 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2981 2982 // 2983 // Add the mask for each OS proc # to the list. 2984 // 2985 if (stride > 0) { 2986 do { 2987 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2988 start += stride; 2989 } while (start <= end); 2990 } 2991 else { 2992 do { 2993 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2994 start += stride; 2995 } while (start >= end); 2996 } 2997 2998 // 2999 // Skip optional comma. 
3000 // 3001 SKIP_WS(next); 3002 if (*next == ',') { 3003 next++; 3004 } 3005 scan = next; 3006 } 3007 3008 *out_numMasks = nextNewMask; 3009 if (nextNewMask == 0) { 3010 *out_masks = NULL; 3011 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3012 return; 3013 } 3014 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3015 for(i = 0; i < nextNewMask; i++) { 3016 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3017 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3018 KMP_CPU_COPY(dest, src); 3019 } 3020 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3021 KMP_CPU_FREE(sumMask); 3022 } 3023 3024 3025 # if OMP_40_ENABLED 3026 3027 /*----------------------------------------------------------------------------- 3028 3029 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3030 places. Again, Here is the grammar: 3031 3032 place_list := place 3033 place_list := place , place_list 3034 place := num 3035 place := place : num 3036 place := place : num : signed 3037 place := { subplacelist } 3038 place := ! place // (lowest priority) 3039 subplace_list := subplace 3040 subplace_list := subplace , subplace_list 3041 subplace := num 3042 subplace := num : num 3043 subplace := num : num : signed 3044 signed := num 3045 signed := + signed 3046 signed := - signed 3047 3048 -----------------------------------------------------------------------------*/ 3049 3050 static void 3051 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 3052 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3053 { 3054 const char *next; 3055 3056 for (;;) { 3057 int start, count, stride, i; 3058 3059 // 3060 // Read in the starting proc id 3061 // 3062 SKIP_WS(*scan); 3063 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3064 "bad explicit places list"); 3065 next = *scan; 3066 SKIP_DIGITS(next); 3067 start = __kmp_str_to_int(*scan, *next); 3068 KMP_ASSERT(start >= 0); 3069 *scan = next; 3070 3071 // 3072 // valid follow sets are ',' ':' and '}' 3073 // 3074 SKIP_WS(*scan); 3075 if (**scan == '}' || **scan == ',') { 3076 if ((start > maxOsId) || 3077 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3078 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3079 && (__kmp_affinity_type != affinity_none))) { 3080 KMP_WARNING(AffIgnoreInvalidProcID, start); 3081 } 3082 } 3083 else { 3084 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3085 (*setSize)++; 3086 } 3087 if (**scan == '}') { 3088 break; 3089 } 3090 (*scan)++; // skip ',' 3091 continue; 3092 } 3093 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3094 (*scan)++; // skip ':' 3095 3096 // 3097 // Read count parameter 3098 // 3099 SKIP_WS(*scan); 3100 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3101 "bad explicit places list"); 3102 next = *scan; 3103 SKIP_DIGITS(next); 3104 count = __kmp_str_to_int(*scan, *next); 3105 KMP_ASSERT(count >= 0); 3106 *scan = next; 3107 3108 // 3109 // valid follow sets are ',' ':' and '}' 3110 // 3111 SKIP_WS(*scan); 3112 if (**scan == '}' || **scan == ',') { 3113 for (i = 0; i < count; i++) { 3114 if ((start > maxOsId) || 3115 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3116 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3117 && (__kmp_affinity_type != affinity_none))) { 3118 KMP_WARNING(AffIgnoreInvalidProcID, start); 3119 } 3120 break; // don't proliferate warnings for large count 3121 } 3122 else { 3123 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3124 start++; 3125 (*setSize)++; 3126 } 3127 } 3128 if (**scan == '}') { 3129 break; 3130 } 3131 (*scan)++; // skip ',' 3132 continue; 3133 } 3134 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3135 (*scan)++; // skip ':' 3136 3137 // 3138 // Read stride parameter 3139 // 3140 int sign = +1; 3141 for (;;) { 3142 SKIP_WS(*scan); 3143 if (**scan == '+') { 3144 (*scan)++; // skip '+' 3145 continue; 3146 } 3147 if (**scan == '-') { 3148 sign *= -1; 3149 (*scan)++; // skip '-' 3150 continue; 3151 } 3152 break; 3153 } 3154 SKIP_WS(*scan); 3155 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3156 "bad explicit places list"); 3157 next = *scan; 3158 SKIP_DIGITS(next); 3159 stride = __kmp_str_to_int(*scan, *next); 3160 KMP_ASSERT(stride >= 0); 3161 *scan = next; 3162 stride *= sign; 3163 3164 // 3165 // valid follow sets are ',' and '}' 3166 // 3167 SKIP_WS(*scan); 3168 if (**scan == '}' || **scan == ',') { 3169 for (i = 0; i < count; i++) { 3170 if ((start > maxOsId) || 3171 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3172 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3173 && (__kmp_affinity_type != affinity_none))) { 3174 KMP_WARNING(AffIgnoreInvalidProcID, start); 3175 } 3176 break; // don't proliferate warnings for large count 3177 } 3178 else { 3179 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3180 start += stride; 3181 (*setSize)++; 3182 } 3183 } 3184 if (**scan == '}') { 3185 break; 3186 } 3187 (*scan)++; // skip ',' 3188 continue; 3189 } 3190 3191 KMP_ASSERT2(0, "bad explicit places list"); 3192 } 3193 } 3194 3195 3196 static void 3197 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3198 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3199 { 3200 const char *next; 3201 3202 // 3203 // valid follow sets are '{' '!' and num 3204 // 3205 SKIP_WS(*scan); 3206 if (**scan == '{') { 3207 (*scan)++; // skip '{' 3208 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3209 setSize); 3210 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3211 (*scan)++; // skip '}' 3212 } 3213 else if (**scan == '!') { 3214 (*scan)++; // skip '!' 3215 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3216 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3217 } 3218 else if ((**scan >= '0') && (**scan <= '9')) { 3219 next = *scan; 3220 SKIP_DIGITS(next); 3221 int num = __kmp_str_to_int(*scan, *next); 3222 KMP_ASSERT(num >= 0); 3223 if ((num > maxOsId) || 3224 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3225 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3226 && (__kmp_affinity_type != affinity_none))) { 3227 KMP_WARNING(AffIgnoreInvalidProcID, num); 3228 } 3229 } 3230 else { 3231 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3232 (*setSize)++; 3233 } 3234 *scan = next; // skip num 3235 } 3236 else { 3237 KMP_ASSERT2(0, "bad explicit places list"); 3238 } 3239 } 3240 3241 3242 //static void 3243 void 3244 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3245 unsigned int *out_numMasks, const char *placelist, 3246 kmp_affin_mask_t *osId2Mask, int maxOsId) 3247 { 3248 int i,j,count,stride,sign; 3249 const char *scan = placelist; 3250 const char *next = placelist; 3251 3252 numNewMasks = 2; 3253 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3254 nextNewMask = 0; 3255 3256 // tempMask is modified based on the previous or initial 3257 // place to form the current place 3258 // previousMask contains the previous place 3259 kmp_affin_mask_t *tempMask; 3260 kmp_affin_mask_t *previousMask; 3261 KMP_CPU_ALLOC(tempMask); 3262 KMP_CPU_ZERO(tempMask); 3263 KMP_CPU_ALLOC(previousMask); 3264 KMP_CPU_ZERO(previousMask); 3265 int setSize = 0; 3266 3267 for (;;) { 3268 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3269 3270 // 3271 // valid follow sets are ',' ':' and EOL 3272 // 3273 SKIP_WS(scan); 3274 if (*scan == '\0' || *scan == ',') { 3275 if (setSize > 0) { 3276 ADD_MASK(tempMask); 3277 } 3278 KMP_CPU_ZERO(tempMask); 3279 setSize = 0; 3280 if (*scan == '\0') { 3281 break; 3282 } 3283 scan++; // skip ',' 3284 continue; 3285 } 3286 3287 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3288 scan++; // skip ':' 3289 3290 // 3291 // Read count parameter 3292 // 3293 SKIP_WS(scan); 3294 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3295 "bad explicit places list"); 3296 next = scan; 3297 SKIP_DIGITS(next); 3298 count = __kmp_str_to_int(scan, *next); 3299 KMP_ASSERT(count >= 0); 3300 scan = next; 3301 3302 // 3303 // valid follow sets are ',' ':' and EOL 3304 // 3305 SKIP_WS(scan); 3306 if (*scan == '\0' || *scan == ',') { 3307 stride = +1; 3308 } 3309 else { 3310 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3311 scan++; // skip ':' 3312 3313 // 3314 // Read stride parameter 3315 // 3316 sign = +1; 3317 for (;;) { 3318 SKIP_WS(scan); 3319 if (*scan == '+') { 3320 scan++; // skip '+' 3321 continue; 3322 } 3323 if (*scan == '-') { 3324 sign *= -1; 3325 scan++; // skip '-' 3326 continue; 3327 } 3328 break; 3329 } 3330 SKIP_WS(scan); 3331 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3332 "bad explicit places list"); 3333 next = scan; 3334 SKIP_DIGITS(next); 3335 stride = __kmp_str_to_int(scan, *next); 3336 KMP_DEBUG_ASSERT(stride >= 0); 3337 scan = next; 3338 stride *= sign; 3339 } 3340 3341 // Add places determined by initial_place : count : stride 3342 for (i = 0; i < count; i++) { 3343 if (setSize == 0) { 3344 break; 3345 } 3346 // Add the current place, then build the next place (tempMask) from that 3347 KMP_CPU_COPY(previousMask, tempMask); 3348 ADD_MASK(previousMask); 3349 KMP_CPU_ZERO(tempMask); 3350 setSize = 0; 3351 KMP_CPU_SET_ITERATE(j, previousMask) { 3352 if (! KMP_CPU_ISSET(j, previousMask)) { 3353 continue; 3354 } 3355 if ((j+stride > maxOsId) || (j+stride < 0) || 3356 (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3357 (! 
KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) { 3358 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3359 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3360 KMP_WARNING(AffIgnoreInvalidProcID, j+stride); 3361 } 3362 continue; 3363 } 3364 KMP_CPU_SET(j+stride, tempMask); 3365 setSize++; 3366 } 3367 } 3368 KMP_CPU_ZERO(tempMask); 3369 setSize = 0; 3370 3371 // 3372 // valid follow sets are ',' and EOL 3373 // 3374 SKIP_WS(scan); 3375 if (*scan == '\0') { 3376 break; 3377 } 3378 if (*scan == ',') { 3379 scan++; // skip ',' 3380 continue; 3381 } 3382 3383 KMP_ASSERT2(0, "bad explicit places list"); 3384 } 3385 3386 *out_numMasks = nextNewMask; 3387 if (nextNewMask == 0) { 3388 *out_masks = NULL; 3389 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3390 return; 3391 } 3392 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3393 KMP_CPU_FREE(tempMask); 3394 KMP_CPU_FREE(previousMask); 3395 for(i = 0; i < nextNewMask; i++) { 3396 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3397 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3398 KMP_CPU_COPY(dest, src); 3399 } 3400 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3401 } 3402 3403 # endif /* OMP_40_ENABLED */ 3404 3405 #undef ADD_MASK 3406 #undef ADD_MASK_OSID 3407 3408 #if KMP_USE_HWLOC 3409 static int 3410 __kmp_hwloc_count_children_by_type( 3411 hwloc_topology_t t, hwloc_obj_t o, hwloc_obj_type_t type, hwloc_obj_t* f) 3412 { 3413 if (!hwloc_compare_types(o->type, type)) { 3414 if (*f == NULL) 3415 *f = o; // output first descendant found 3416 return 1; 3417 } 3418 int sum = 0; 3419 for (unsigned i = 0; i < o->arity; i++) 3420 sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f); 3421 return sum; // will be 0 if no one found (as PU arity is 0) 3422 } 3423 3424 static int 3425 __kmp_hwloc_count_children_by_depth( 3426 hwloc_topology_t t, hwloc_obj_t o, unsigned depth, hwloc_obj_t* f) 3427 { 3428 if (o->depth == depth) { 3429 if (*f == NULL) 3430 *f = o; // output first descendant found 3431 return 1; 3432 } 3433 int sum = 0; 3434 for (unsigned i = 0; i < o->arity; i++) 3435 sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f); 3436 return sum; // will be 0 if no one found (as PU arity is 0) 3437 } 3438 3439 static int 3440 __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) 3441 { // skip PUs descendants of the object o 3442 int skipped = 0; 3443 hwloc_obj_t hT = NULL; 3444 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3445 for (int i = 0; i < N; ++i) { 3446 KMP_DEBUG_ASSERT(hT); 3447 unsigned idx = hT->os_index; 3448 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3449 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3450 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3451 ++skipped; 3452 } 3453 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3454 } 3455 return skipped; // count number of skipped units 3456 } 3457 3458 static int 3459 __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) 3460 { // check if obj has PUs present in fullMask 3461 hwloc_obj_t hT = NULL; 3462 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT); 3463 for (int i = 0; i < N; ++i) { 3464 KMP_DEBUG_ASSERT(hT); 3465 unsigned idx = hT->os_index; 3466 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) 3467 return 1; // found PU 3468 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT); 3469 } 3470 return 0; // no PUs found 3471 } 3472 #endif // KMP_USE_HWLOC 3473 3474 static void 3475 __kmp_apply_thread_places(AddrUnsPair 
**pAddr, int depth) 3476 { 3477 AddrUnsPair *newAddr; 3478 if (__kmp_hws_requested == 0) 3479 goto _exit; // no topology limiting actions requested, exit 3480 #if KMP_USE_HWLOC 3481 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3482 // The number of subobjects is calculated dynamically, so this works fine for 3483 // any non-uniform topology. 3484 // L2 cache objects are located by depth, other objects by type. 3485 hwloc_topology_t tp = __kmp_hwloc_topology; 3486 int nS=0, nN=0, nL=0, nC=0, nT=0; // logical index including skipped 3487 int nCr=0, nTr=0; // number of requested units 3488 int nPkg=0, nCo=0, n_new=0, n_old = 0, nCpP=0, nTpC=0; // counters 3489 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc object pointers 3490 int L2depth, idx; 3491 3492 // check support of extensions ---------------------------------- 3493 int numa_support = 0, tile_support = 0; 3494 if (__kmp_pu_os_idx) 3495 hT = hwloc_get_pu_obj_by_os_index( 3496 tp, __kmp_pu_os_idx[__kmp_avail_proc - 1]); 3497 else 3498 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1); 3499 if (hT == NULL) { // something's gone wrong 3500 KMP_WARNING(AffHWSubsetUnsupported); 3501 goto _exit; 3502 } 3503 // check NUMA node 3504 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT); 3505 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT); 3506 if (hN != NULL && hN->depth > hS->depth) { 3507 numa_support = 1; // set to 1 when the socket contains NUMA node(s) 3508 } else if (__kmp_hws_node.num > 0) { 3509 // sockets inside a NUMA node are not supported (no such HW was available for testing) 3510 KMP_WARNING(AffHWSubsetUnsupported); 3511 goto _exit; 3512 } 3513 // check L2 cache; get the object by depth because there may be multiple cache levels 3514 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED); 3515 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT); 3516 if (hL != NULL && __kmp_hwloc_count_children_by_type( 3517 tp, hL, HWLOC_OBJ_CORE, &hC) > 1) { 3518 tile_support = 1; // no point counting L2 if it contains only a single core 3519 } else if (__kmp_hws_tile.num > 0) { 3520 if (__kmp_hws_core.num == 0) { 3521 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core 3522 __kmp_hws_tile.num = 0; 3523 } else { 3524 // L2 and core are both requested, but represent the same object 3525 KMP_WARNING(AffHWSubsetInvalid); 3526 goto _exit; 3527 } 3528 } 3529 // end of check of extensions ----------------------------------- 3530 3531 // fill in unset items, validate settings ----------------------- 3532 if (__kmp_hws_socket.num == 0) 3533 __kmp_hws_socket.num = nPackages; // use all available sockets 3534 if (__kmp_hws_socket.offset >= nPackages) { 3535 KMP_WARNING(AffHWSubsetManySockets); 3536 goto _exit; 3537 } 3538 if (numa_support) { 3539 int NN = __kmp_hwloc_count_children_by_type( 3540 tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in socket 3541 if (__kmp_hws_node.num == 0) 3542 __kmp_hws_node.num = NN; // use all available nodes 3543 if (__kmp_hws_node.offset >= NN) { 3544 KMP_WARNING(AffHWSubsetManyNodes); 3545 goto _exit; 3546 } 3547 if (tile_support) { 3548 // get num tiles in node 3549 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3550 if (__kmp_hws_tile.num == 0) { 3551 __kmp_hws_tile.num = NL + 1; 3552 } // use all available tiles; some nodes may have more tiles, hence the +1 3553 if (__kmp_hws_tile.offset >= NL) { 3554 KMP_WARNING(AffHWSubsetManyTiles); 3555 goto _exit; 3556 } 3557 int NC = __kmp_hwloc_count_children_by_type( 3558 tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile 3559 if 
(__kmp_hws_core.num == 0) 3560 __kmp_hws_core.num = NC; // use all available cores 3561 if (__kmp_hws_core.offset >= NC) { 3562 KMP_WARNING(AffHWSubsetManyCores); 3563 goto _exit; 3564 } 3565 } else { // tile_support 3566 int NC = __kmp_hwloc_count_children_by_type( 3567 tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in node 3568 if (__kmp_hws_core.num == 0) 3569 __kmp_hws_core.num = NC; // use all available cores 3570 if (__kmp_hws_core.offset >= NC) { 3571 KMP_WARNING(AffHWSubsetManyCores); 3572 goto _exit; 3573 } 3574 } // tile_support 3575 } else { // numa_support 3576 if (tile_support) { 3577 // get num tiles in socket 3578 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL); 3579 if (__kmp_hws_tile.num == 0) 3580 __kmp_hws_tile.num = NL; // use all available tiles 3581 if (__kmp_hws_tile.offset >= NL) { 3582 KMP_WARNING(AffHWSubsetManyTiles); 3583 goto _exit; 3584 } 3585 int NC = __kmp_hwloc_count_children_by_type( 3586 tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in tile 3587 if (__kmp_hws_core.num == 0) 3588 __kmp_hws_core.num = NC; // use all available cores 3589 if (__kmp_hws_core.offset >= NC) { 3590 KMP_WARNING(AffHWSubsetManyCores); 3591 goto _exit; 3592 } 3593 } else { // tile_support 3594 int NC = __kmp_hwloc_count_children_by_type( 3595 tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket 3596 if (__kmp_hws_core.num == 0) 3597 __kmp_hws_core.num = NC; // use all available cores 3598 if (__kmp_hws_core.offset >= NC) { 3599 KMP_WARNING(AffHWSubsetManyCores); 3600 goto _exit; 3601 } 3602 } // tile_support 3603 } 3604 if (__kmp_hws_proc.num == 0) 3605 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs 3606 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) { 3607 KMP_WARNING(AffHWSubsetManyProcs); 3608 goto _exit; 3609 } 3610 // end of validation -------------------------------------------- 3611 3612 if (pAddr) // pAddr is NULL in case of affinity_none 3613 newAddr = (AddrUnsPair *)__kmp_allocate( 3614 sizeof(AddrUnsPair) * __kmp_avail_proc); // max size 3615 // main loop to form HW subset ---------------------------------- 3616 hS = NULL; 3617 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE); 3618 for (int s = 0; s < NP; ++s) { 3619 // Check Socket ----------------------------------------------- 3620 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS); 3621 if (!__kmp_hwloc_obj_has_PUs(tp, hS)) 3622 continue; // skip socket if all PUs are out of fullMask 3623 ++nS; // only count objects those have PUs in affinity mask 3624 if (nS <= __kmp_hws_socket.offset || 3625 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) { 3626 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket 3627 continue; // move to next socket 3628 } 3629 nCr = 0; // count number of cores per socket 3630 // socket requested, go down the topology tree 3631 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile) 3632 if (numa_support) { 3633 nN = 0; 3634 hN = NULL; 3635 int NN = __kmp_hwloc_count_children_by_type( 3636 tp, hS, HWLOC_OBJ_NUMANODE, &hN); // num nodes in current socket 3637 for (int n = 0; n < NN; ++n) { 3638 // Check NUMA Node ---------------------------------------- 3639 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) { 3640 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3641 continue; // skip node if all PUs are out of fullMask 3642 } 3643 ++nN; 3644 if (nN <= __kmp_hws_node.offset || 3645 nN > __kmp_hws_node.num + __kmp_hws_node.offset) { 3646 // skip node as not requested 3647 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // 
skip node 3648 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3649 continue; // move to next node 3650 } 3651 // node requested, go down the topology tree 3652 if (tile_support) { 3653 nL = 0; 3654 hL = NULL; 3655 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL); 3656 for (int l = 0; l < NL; ++l) { 3657 // Check L2 (tile) ------------------------------------ 3658 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3659 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3660 continue; // skip tile if all PUs are out of fullMask 3661 } 3662 ++nL; 3663 if (nL <= __kmp_hws_tile.offset || 3664 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3665 // skip tile as not requested 3666 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3667 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3668 continue; // move to next tile 3669 } 3670 // tile requested, go down the topology tree 3671 nC = 0; 3672 hC = NULL; 3673 int NC = __kmp_hwloc_count_children_by_type( 3674 tp, hL, HWLOC_OBJ_CORE, &hC); // num cores in current tile 3675 for (int c = 0; c < NC; ++c) { 3676 // Check Core --------------------------------------- 3677 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3678 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3679 continue; // skip core if all PUs are out of fullMask 3680 } 3681 ++nC; 3682 if (nC <= __kmp_hws_core.offset || 3683 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3684 // skip node as not requested 3685 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3686 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3687 continue; // move to next node 3688 } 3689 // core requested, go down to PUs 3690 nT = 0; 3691 nTr = 0; 3692 hT = NULL; 3693 int NT = __kmp_hwloc_count_children_by_type( 3694 tp, hC, HWLOC_OBJ_PU, &hT); // num procs in current core 3695 for (int t = 0; t < NT; ++t) { 3696 // Check PU --------------------------------------- 3697 idx = hT->os_index; 3698 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3699 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3700 continue; // skip PU if not in fullMask 3701 } 3702 ++nT; 3703 if (nT <= __kmp_hws_proc.offset || 3704 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3705 // skip PU 3706 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3707 ++n_old; 3708 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3709 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3710 continue; // move to next node 3711 } 3712 ++nTr; 3713 if (pAddr) // collect requested thread's data 3714 newAddr[n_new] = (*pAddr)[n_old]; 3715 ++n_new; 3716 ++n_old; 3717 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3718 } // threads loop 3719 if (nTr > 0) { 3720 ++nCr; // num cores per socket 3721 ++nCo; // total num cores 3722 if (nTr > nTpC) 3723 nTpC = nTr; // calc max threads per core 3724 } 3725 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3726 } // cores loop 3727 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3728 } // tiles loop 3729 } else { // tile_support 3730 // no tiles, check cores 3731 nC = 0; 3732 hC = NULL; 3733 int NC = __kmp_hwloc_count_children_by_type( 3734 tp, hN, HWLOC_OBJ_CORE, &hC); // num cores in current node 3735 for (int c = 0; c < NC; ++c) { 3736 // Check Core --------------------------------------- 3737 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3738 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3739 continue; // skip core if all PUs are out of fullMask 3740 } 3741 ++nC; 3742 if (nC <= __kmp_hws_core.offset || 3743 nC > __kmp_hws_core.num + 
__kmp_hws_core.offset) { 3744 // skip node as not requested 3745 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3746 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3747 continue; // move to next node 3748 } 3749 // core requested, go down to PUs 3750 nT = 0; 3751 nTr = 0; 3752 hT = NULL; 3753 int NT = __kmp_hwloc_count_children_by_type( 3754 tp, hC, HWLOC_OBJ_PU, &hT); 3755 for (int t = 0; t < NT; ++t) { 3756 // Check PU --------------------------------------- 3757 idx = hT->os_index; 3758 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3759 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3760 continue; // skip PU if not in fullMask 3761 } 3762 ++nT; 3763 if (nT <= __kmp_hws_proc.offset || 3764 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3765 // skip PU 3766 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3767 ++n_old; 3768 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3769 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3770 continue; // move to next node 3771 } 3772 ++nTr; 3773 if (pAddr) // collect requested thread's data 3774 newAddr[n_new] = (*pAddr)[n_old]; 3775 ++n_new; 3776 ++n_old; 3777 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3778 } // threads loop 3779 if (nTr > 0) { 3780 ++nCr; // num cores per socket 3781 ++nCo; // total num cores 3782 if (nTr > nTpC) 3783 nTpC = nTr; // calc max threads per core 3784 } 3785 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3786 } // cores loop 3787 } // tiles support 3788 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN); 3789 } // nodes loop 3790 } else { // numa_support 3791 // no NUMA support 3792 if (tile_support) { 3793 nL = 0; 3794 hL = NULL; 3795 int NL = __kmp_hwloc_count_children_by_depth( 3796 tp, hS, L2depth, &hL); // num tiles in current socket 3797 for (int l = 0; l < NL; ++l) { 3798 // Check L2 (tile) ------------------------------------ 3799 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) { 3800 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3801 continue; // skip tile if all PUs are out of fullMask 3802 } 3803 ++nL; 3804 if (nL <= __kmp_hws_tile.offset || 3805 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) { 3806 // skip tile as not requested 3807 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile 3808 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3809 continue; // move to next tile 3810 } 3811 // tile requested, go down the topology tree 3812 nC = 0; 3813 hC = NULL; 3814 int NC = __kmp_hwloc_count_children_by_type( 3815 tp, hL, HWLOC_OBJ_CORE, &hC); // num cores per tile 3816 for (int c = 0; c < NC; ++c) { 3817 // Check Core --------------------------------------- 3818 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3819 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3820 continue; // skip core if all PUs are out of fullMask 3821 } 3822 ++nC; 3823 if (nC <= __kmp_hws_core.offset || 3824 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3825 // skip node as not requested 3826 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3827 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3828 continue; // move to next node 3829 } 3830 // core requested, go down to PUs 3831 nT = 0; 3832 nTr = 0; 3833 hT = NULL; 3834 int NT = __kmp_hwloc_count_children_by_type( 3835 tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core 3836 for (int t = 0; t < NT; ++t) { 3837 // Check PU --------------------------------------- 3838 idx = hT->os_index; 3839 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3840 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, 
hT); 3841 continue; // skip PU if not in fullMask 3842 } 3843 ++nT; 3844 if (nT <= __kmp_hws_proc.offset || 3845 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3846 // skip PU 3847 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3848 ++n_old; 3849 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3850 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3851 continue; // move to next node 3852 } 3853 ++nTr; 3854 if (pAddr) // collect requested thread's data 3855 newAddr[n_new] = (*pAddr)[n_old]; 3856 ++n_new; 3857 ++n_old; 3858 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3859 } // threads loop 3860 if (nTr > 0) { 3861 ++nCr; // num cores per socket 3862 ++nCo; // total num cores 3863 if (nTr > nTpC) 3864 nTpC = nTr; // calc max threads per core 3865 } 3866 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3867 } // cores loop 3868 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL); 3869 } // tiles loop 3870 } else { // tile_support 3871 // no tiles, check cores 3872 nC = 0; 3873 hC = NULL; 3874 int NC = __kmp_hwloc_count_children_by_type( 3875 tp, hS, HWLOC_OBJ_CORE, &hC); // num cores in socket 3876 for (int c = 0; c < NC; ++c) { 3877 // Check Core ------------------------------------------- 3878 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) { 3879 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3880 continue; // skip core if all PUs are out of fullMask 3881 } 3882 ++nC; 3883 if (nC <= __kmp_hws_core.offset || 3884 nC > __kmp_hws_core.num + __kmp_hws_core.offset) { 3885 // skip node as not requested 3886 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core 3887 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3888 continue; // move to next node 3889 } 3890 // core requested, go down to PUs 3891 nT = 0; 3892 nTr = 0; 3893 hT = NULL; 3894 int NT = __kmp_hwloc_count_children_by_type( 3895 tp, hC, HWLOC_OBJ_PU, &hT); // num procs per core 3896 for (int t = 0; t < NT; ++t) { 3897 // Check PU --------------------------------------- 3898 idx = hT->os_index; 3899 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) { 3900 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3901 continue; // skip PU if not in fullMask 3902 } 3903 ++nT; 3904 if (nT <= __kmp_hws_proc.offset || 3905 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) { 3906 // skip PU 3907 KMP_CPU_CLR(idx, __kmp_affin_fullMask); 3908 ++n_old; 3909 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx)); 3910 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3911 continue; // move to next node 3912 } 3913 ++nTr; 3914 if (pAddr) // collect requested thread's data 3915 newAddr[n_new] = (*pAddr)[n_old]; 3916 ++n_new; 3917 ++n_old; 3918 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT); 3919 } // threads loop 3920 if (nTr > 0) { 3921 ++nCr; // num cores per socket 3922 ++nCo; // total num cores 3923 if (nTr > nTpC) 3924 nTpC = nTr; // calc max threads per core 3925 } 3926 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC); 3927 } // cores loop 3928 } // tiles support 3929 } // numa_support 3930 if (nCr > 0) { // found cores? 
3931 ++nPkg; // num sockets 3932 if (nCr > nCpP) 3933 nCpP = nCr; // calc max cores per socket 3934 } 3935 } // sockets loop 3936 3937 // check the subset is valid 3938 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc); 3939 KMP_DEBUG_ASSERT(nPkg > 0); 3940 KMP_DEBUG_ASSERT(nCpP > 0); 3941 KMP_DEBUG_ASSERT(nTpC > 0); 3942 KMP_DEBUG_ASSERT(nCo > 0); 3943 KMP_DEBUG_ASSERT(nPkg <= nPackages); 3944 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg); 3945 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore); 3946 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores); 3947 3948 nPackages = nPkg; // correct num sockets 3949 nCoresPerPkg = nCpP; // correct num cores per socket 3950 __kmp_nThreadsPerCore = nTpC; // correct num threads per core 3951 __kmp_avail_proc = n_new; // correct num procs 3952 __kmp_ncores = nCo; // correct num cores 3953 // hwloc topology method end 3954 } else 3955 #endif // KMP_USE_HWLOC 3956 { 3957 int n_old = 0, n_new = 0, proc_num = 0; 3958 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) { 3959 KMP_WARNING(AffHWSubsetNoHWLOC); 3960 goto _exit; 3961 } 3962 if (__kmp_hws_socket.num == 0) 3963 __kmp_hws_socket.num = nPackages; // use all available sockets 3964 if (__kmp_hws_core.num == 0) 3965 __kmp_hws_core.num = nCoresPerPkg; // use all available cores 3966 if (__kmp_hws_proc.num == 0 || 3967 __kmp_hws_proc.num > __kmp_nThreadsPerCore) 3968 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts 3969 if ( !__kmp_affinity_uniform_topology() ) { 3970 KMP_WARNING( AffHWSubsetNonUniform ); 3971 goto _exit; // don't support non-uniform topology 3972 } 3973 if ( depth > 3 ) { 3974 KMP_WARNING( AffHWSubsetNonThreeLevel ); 3975 goto _exit; // don't support not-3-level topology 3976 } 3977 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) { 3978 KMP_WARNING(AffHWSubsetManySockets); 3979 goto _exit; 3980 } 3981 if ( __kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg ) { 3982 KMP_WARNING( AffHWSubsetManyCores ); 3983 goto _exit; 3984 } 3985 // Form the requested subset 3986 if (pAddr) // pAddr is NULL in case of affinity_none 3987 newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3988 __kmp_hws_socket.num * __kmp_hws_core.num * __kmp_hws_proc.num); 3989 for (int i = 0; i < nPackages; ++i) { 3990 if (i < __kmp_hws_socket.offset || 3991 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) { 3992 // skip not-requested socket 3993 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; 3994 if (__kmp_pu_os_idx != NULL) { 3995 // walk through skipped socket 3996 for (int j = 0; j < nCoresPerPkg; ++j) { 3997 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 3998 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3999 ++proc_num; 4000 } 4001 } 4002 } 4003 } else { 4004 // walk through requested socket 4005 for (int j = 0; j < nCoresPerPkg; ++j) { 4006 if (j < __kmp_hws_core.offset || 4007 j >= __kmp_hws_core.offset + __kmp_hws_core.num) 4008 { // skip not-requested core 4009 n_old += __kmp_nThreadsPerCore; 4010 if (__kmp_pu_os_idx != NULL) { 4011 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4012 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 4013 ++proc_num; 4014 } 4015 } 4016 } else { 4017 // walk through requested core 4018 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) { 4019 if (k < __kmp_hws_proc.num) { 4020 if (pAddr) // collect requested thread's data 4021 newAddr[n_new] = (*pAddr)[n_old]; 4022 n_new++; 4023 } else { 4024 if (__kmp_pu_os_idx != NULL) 4025 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 4026 } 4027 n_old++; 4028 
++proc_num; 4029 } 4030 } 4031 } 4032 } 4033 } 4034 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 4035 KMP_DEBUG_ASSERT(n_new == __kmp_hws_socket.num * __kmp_hws_core.num * 4036 __kmp_hws_proc.num); 4037 nPackages = __kmp_hws_socket.num; // correct nPackages 4038 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg 4039 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore 4040 __kmp_avail_proc = n_new; // correct avail_proc 4041 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores 4042 } // non-hwloc topology method 4043 if (pAddr) { 4044 __kmp_free( *pAddr ); 4045 *pAddr = newAddr; // replace old topology with new one 4046 } 4047 if (__kmp_affinity_verbose) { 4048 char m[KMP_AFFIN_MASK_PRINT_LEN]; 4049 __kmp_affinity_print_mask(m,KMP_AFFIN_MASK_PRINT_LEN,__kmp_affin_fullMask); 4050 if (__kmp_affinity_respect_mask) { 4051 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m); 4052 } else { 4053 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m); 4054 } 4055 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc); 4056 kmp_str_buf_t buf; 4057 __kmp_str_buf_init(&buf); 4058 __kmp_str_buf_print(&buf, "%d", nPackages); 4059 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg, 4060 __kmp_nThreadsPerCore, __kmp_ncores); 4061 __kmp_str_buf_free(&buf); 4062 } 4063 _exit: 4064 if (__kmp_pu_os_idx != NULL) { 4065 __kmp_free(__kmp_pu_os_idx); 4066 __kmp_pu_os_idx = NULL; 4067 } 4068 } 4069 4070 // 4071 // This function figures out the deepest level at which there is at least one cluster/core 4072 // with more than one processing unit bound to it. 4073 // 4074 static int 4075 __kmp_affinity_find_core_level(const AddrUnsPair *address2os, int nprocs, int bottom_level) 4076 { 4077 int core_level = 0; 4078 4079 for( int i = 0; i < nprocs; i++ ) { 4080 for( int j = bottom_level; j > 0; j-- ) { 4081 if( address2os[i].first.labels[j] > 0 ) { 4082 if( core_level < ( j - 1 ) ) { 4083 core_level = j - 1; 4084 } 4085 } 4086 } 4087 } 4088 return core_level; 4089 } 4090 4091 // 4092 // This function counts the number of clusters/cores at the given level. 4093 // 4094 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level) 4095 { 4096 int ncores = 0; 4097 int i, j; 4098 4099 j = bottom_level; 4100 for( i = 0; i < nprocs; i++ ) { 4101 for ( j = bottom_level; j > core_level; j-- ) { 4102 if( ( i + 1 ) < nprocs ) { 4103 if( address2os[i + 1].first.labels[j] > 0 ) { 4104 break; 4105 } 4106 } 4107 } 4108 if( j == core_level ) { 4109 ncores++; 4110 } 4111 } 4112 if( j > core_level ) { 4113 // 4114 // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss one core. 4115 // May occur when called from __kmp_affinity_find_core(). 4116 // 4117 ncores++; 4118 } 4119 return ncores; 4120 } 4121 4122 // 4123 // This function finds the cluster/core to which the given processing unit is bound. 4124 // 4125 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc, int bottom_level, int core_level) 4126 { 4127 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level, core_level) - 1; 4128 } 4129 4130 // 4131 // This function finds the maximal number of processing units bound to a cluster/core at the given level.
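// As an illustrative (hypothetical) example: on a 2-package machine where one
// core carries 4 hardware threads and every other core carries 2,
// __kmp_affinity_find_core_level() resolves core_level to the core layer,
// __kmp_affinity_compute_ncores() returns the total number of such cores, and
// this function returns 4, the largest per-core PU count observed.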
4132 // 4133 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os, int nprocs, int bottom_level, int core_level) 4134 { 4135 int maxprocpercore = 0; 4136 4137 if( core_level < bottom_level ) { 4138 for( int i = 0; i < nprocs; i++ ) { 4139 int percore = address2os[i].first.labels[core_level + 1] + 1; 4140 4141 if( percore > maxprocpercore ) { 4142 maxprocpercore = percore; 4143 } 4144 } 4145 } else { 4146 maxprocpercore = 1; 4147 } 4148 return maxprocpercore; 4149 } 4150 4151 static AddrUnsPair *address2os = NULL; 4152 static int * procarr = NULL; 4153 static int __kmp_aff_depth = 0; 4154 4155 #define KMP_EXIT_AFF_NONE \ 4156 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 4157 KMP_ASSERT(address2os == NULL); \ 4158 __kmp_apply_thread_places(NULL, 0); \ 4159 return; 4160 4161 static int 4162 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) 4163 { 4164 const Address *aa = (const Address *)&(((AddrUnsPair *)a) 4165 ->first); 4166 const Address *bb = (const Address *)&(((AddrUnsPair *)b) 4167 ->first); 4168 unsigned depth = aa->depth; 4169 unsigned i; 4170 KMP_DEBUG_ASSERT(depth == bb->depth); 4171 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 4172 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 4173 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 4174 int j = depth - i - 1; 4175 if (aa->childNums[j] < bb->childNums[j]) return -1; 4176 if (aa->childNums[j] > bb->childNums[j]) return 1; 4177 } 4178 for (; i < depth; i++) { 4179 int j = i - __kmp_affinity_compact; 4180 if (aa->childNums[j] < bb->childNums[j]) return -1; 4181 if (aa->childNums[j] > bb->childNums[j]) return 1; 4182 } 4183 return 0; 4184 } 4185 4186 static void 4187 __kmp_aux_affinity_initialize(void) 4188 { 4189 if (__kmp_affinity_masks != NULL) { 4190 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4191 return; 4192 } 4193 4194 // 4195 // Create the "full" mask - this defines all of the processors that we 4196 // consider to be in the machine model. If respect is set, then it is 4197 // the initialization thread's affinity mask. Otherwise, it is all 4198 // processors that we know about on the machine. 4199 // 4200 if (__kmp_affin_fullMask == NULL) { 4201 KMP_CPU_ALLOC(__kmp_affin_fullMask); 4202 } 4203 if (KMP_AFFINITY_CAPABLE()) { 4204 if (__kmp_affinity_respect_mask) { 4205 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 4206 4207 // 4208 // Count the number of available processors. 4209 // 4210 unsigned i; 4211 __kmp_avail_proc = 0; 4212 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 4213 if (! 
KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 4214 continue; 4215 } 4216 __kmp_avail_proc++; 4217 } 4218 if (__kmp_avail_proc > __kmp_xproc) { 4219 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 4220 && (__kmp_affinity_type != affinity_none))) { 4221 KMP_WARNING(ErrorInitializeAffinity); 4222 } 4223 __kmp_affinity_type = affinity_none; 4224 KMP_AFFINITY_DISABLE(); 4225 return; 4226 } 4227 } 4228 else { 4229 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 4230 __kmp_avail_proc = __kmp_xproc; 4231 } 4232 } 4233 4234 int depth = -1; 4235 kmp_i18n_id_t msg_id = kmp_i18n_null; 4236 4237 // 4238 // For backward compatibility, setting KMP_CPUINFO_FILE => 4239 // KMP_TOPOLOGY_METHOD=cpuinfo 4240 // 4241 if ((__kmp_cpuinfo_file != NULL) && 4242 (__kmp_affinity_top_method == affinity_top_method_all)) { 4243 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 4244 } 4245 4246 if (__kmp_affinity_top_method == affinity_top_method_all) { 4247 // 4248 // In the default code path, errors are not fatal - we just try using 4249 // another method. We only emit a warning message if affinity is on, 4250 // or the verbose flag is set, and the nowarnings flag was not set. 4251 // 4252 const char *file_name = NULL; 4253 int line = 0; 4254 # if KMP_USE_HWLOC 4255 if (depth < 0 && __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 4256 if (__kmp_affinity_verbose) { 4257 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4258 } 4259 if(!__kmp_hwloc_error) { 4260 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4261 if (depth == 0) { 4262 KMP_EXIT_AFF_NONE; 4263 } else if(depth < 0 && __kmp_affinity_verbose) { 4264 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4265 } 4266 } else if(__kmp_affinity_verbose) { 4267 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 4268 } 4269 } 4270 # endif 4271 4272 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 4273 4274 if (depth < 0) { 4275 if (__kmp_affinity_verbose) { 4276 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 4277 } 4278 4279 file_name = NULL; 4280 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4281 if (depth == 0) { 4282 KMP_EXIT_AFF_NONE; 4283 } 4284 4285 if (depth < 0) { 4286 if (__kmp_affinity_verbose) { 4287 if (msg_id != kmp_i18n_null) { 4288 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 4289 KMP_I18N_STR(DecodingLegacyAPIC)); 4290 } 4291 else { 4292 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 4293 } 4294 } 4295 4296 file_name = NULL; 4297 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4298 if (depth == 0) { 4299 KMP_EXIT_AFF_NONE; 4300 } 4301 } 4302 } 4303 4304 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4305 4306 # if KMP_OS_LINUX 4307 4308 if (depth < 0) { 4309 if (__kmp_affinity_verbose) { 4310 if (msg_id != kmp_i18n_null) { 4311 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 4312 } 4313 else { 4314 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 4315 } 4316 } 4317 4318 FILE *f = fopen("/proc/cpuinfo", "r"); 4319 if (f == NULL) { 4320 msg_id = kmp_i18n_str_CantOpenCpuinfo; 4321 } 4322 else { 4323 file_name = "/proc/cpuinfo"; 4324 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4325 fclose(f); 4326 if (depth == 0) { 4327 KMP_EXIT_AFF_NONE; 4328 } 4329 } 4330 } 4331 4332 # endif /* KMP_OS_LINUX */ 4333 4334 # if KMP_GROUP_AFFINITY 4335 4336 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 4337 if (__kmp_affinity_verbose) { 4338
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4339 } 4340 4341 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4342 KMP_ASSERT(depth != 0); 4343 } 4344 4345 # endif /* KMP_GROUP_AFFINITY */ 4346 4347 if (depth < 0) { 4348 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 4349 if (file_name == NULL) { 4350 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 4351 } 4352 else if (line == 0) { 4353 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 4354 } 4355 else { 4356 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 4357 } 4358 } 4359 // FIXME - print msg if msg_id = kmp_i18n_null ??? 4360 4361 file_name = ""; 4362 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4363 if (depth == 0) { 4364 KMP_EXIT_AFF_NONE; 4365 } 4366 KMP_ASSERT(depth > 0); 4367 KMP_ASSERT(address2os != NULL); 4368 } 4369 } 4370 4371 // 4372 // If the user has specified that a particular topology discovery method 4373 // is to be used, then we abort if that method fails. The exception is 4374 // group affinity, which might have been implicitly set. 4375 // 4376 4377 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 4378 4379 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 4380 if (__kmp_affinity_verbose) { 4381 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4382 KMP_I18N_STR(Decodingx2APIC)); 4383 } 4384 4385 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 4386 if (depth == 0) { 4387 KMP_EXIT_AFF_NONE; 4388 } 4389 if (depth < 0) { 4390 KMP_ASSERT(msg_id != kmp_i18n_null); 4391 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4392 } 4393 } 4394 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 4395 if (__kmp_affinity_verbose) { 4396 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 4397 KMP_I18N_STR(DecodingLegacyAPIC)); 4398 } 4399 4400 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 4401 if (depth == 0) { 4402 KMP_EXIT_AFF_NONE; 4403 } 4404 if (depth < 0) { 4405 KMP_ASSERT(msg_id != kmp_i18n_null); 4406 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4407 } 4408 } 4409 4410 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4411 4412 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 4413 const char *filename; 4414 if (__kmp_cpuinfo_file != NULL) { 4415 filename = __kmp_cpuinfo_file; 4416 } 4417 else { 4418 filename = "/proc/cpuinfo"; 4419 } 4420 4421 if (__kmp_affinity_verbose) { 4422 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 4423 } 4424 4425 FILE *f = fopen(filename, "r"); 4426 if (f == NULL) { 4427 int code = errno; 4428 if (__kmp_cpuinfo_file != NULL) { 4429 __kmp_msg( 4430 kmp_ms_fatal, 4431 KMP_MSG(CantOpenFileForReading, filename), 4432 KMP_ERR(code), 4433 KMP_HNT(NameComesFrom_CPUINFO_FILE), 4434 __kmp_msg_null 4435 ); 4436 } 4437 else { 4438 __kmp_msg( 4439 kmp_ms_fatal, 4440 KMP_MSG(CantOpenFileForReading, filename), 4441 KMP_ERR(code), 4442 __kmp_msg_null 4443 ); 4444 } 4445 } 4446 int line = 0; 4447 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 4448 fclose(f); 4449 if (depth < 0) { 4450 KMP_ASSERT(msg_id != kmp_i18n_null); 4451 if (line > 0) { 4452 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 4453 } 4454 else { 4455 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 4456 } 4457 } 4458 if (__kmp_affinity_type == affinity_none) { 4459 KMP_ASSERT(depth == 0); 4460 KMP_EXIT_AFF_NONE; 4461 } 4462 } 4463 4464 # if KMP_GROUP_AFFINITY 4465 4466 else if (__kmp_affinity_top_method
== affinity_top_method_group) { 4467 if (__kmp_affinity_verbose) { 4468 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 4469 } 4470 4471 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 4472 KMP_ASSERT(depth != 0); 4473 if (depth < 0) { 4474 KMP_ASSERT(msg_id != kmp_i18n_null); 4475 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 4476 } 4477 } 4478 4479 # endif /* KMP_GROUP_AFFINITY */ 4480 4481 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 4482 if (__kmp_affinity_verbose) { 4483 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 4484 } 4485 4486 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 4487 if (depth == 0) { 4488 KMP_EXIT_AFF_NONE; 4489 } 4490 // should not fail 4491 KMP_ASSERT(depth > 0); 4492 KMP_ASSERT(address2os != NULL); 4493 } 4494 4495 # if KMP_USE_HWLOC 4496 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 4497 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 4498 if (__kmp_affinity_verbose) { 4499 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 4500 } 4501 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 4502 if (depth == 0) { 4503 KMP_EXIT_AFF_NONE; 4504 } 4505 } 4506 # endif // KMP_USE_HWLOC 4507 4508 if (address2os == NULL) { 4509 if (KMP_AFFINITY_CAPABLE() 4510 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 4511 && (__kmp_affinity_type != affinity_none)))) { 4512 KMP_WARNING(ErrorInitializeAffinity); 4513 } 4514 __kmp_affinity_type = affinity_none; 4515 KMP_AFFINITY_DISABLE(); 4516 return; 4517 } 4518 4519 __kmp_apply_thread_places(&address2os, depth); 4520 4521 // 4522 // Create the table of masks, indexed by thread Id. 4523 // 4524 unsigned maxIndex; 4525 unsigned numUnique; 4526 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 4527 address2os, __kmp_avail_proc); 4528 if (__kmp_affinity_gran_levels == 0) { 4529 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 4530 } 4531 4532 // 4533 // Set the childNums vector in all Address objects. This must be done 4534 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 4535 // which takes into account the setting of __kmp_affinity_compact. 4536 // 4537 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 4538 4539 switch (__kmp_affinity_type) { 4540 4541 case affinity_explicit: 4542 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 4543 # if OMP_40_ENABLED 4544 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4545 # endif 4546 { 4547 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 4548 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 4549 maxIndex); 4550 } 4551 # if OMP_40_ENABLED 4552 else { 4553 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 4554 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 4555 maxIndex); 4556 } 4557 # endif 4558 if (__kmp_affinity_num_masks == 0) { 4559 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 4560 && (__kmp_affinity_type != affinity_none))) { 4561 KMP_WARNING(AffNoValidProcID); 4562 } 4563 __kmp_affinity_type = affinity_none; 4564 return; 4565 } 4566 break; 4567 4568 // 4569 // The other affinity types rely on sorting the Addresses according 4570 // to some permutation of the machine topology tree. Set 4571 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 4572 // then jump to a common code fragment to do the sort and create 4573 // the array of affinity masks. 
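// As a sketch (assuming a uniform three-level package/core/thread tree): with
// __kmp_affinity_compact == 0 the comparator orders addresses package-major, so
// consecutive masks stay inside one package; affinity_scatter instead rewrites
// compact to depth-1, which makes the innermost levels the primary sort keys and
// places consecutive masks on different packages.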
4574 // 4575 4576 case affinity_logical: 4577 __kmp_affinity_compact = 0; 4578 if (__kmp_affinity_offset) { 4579 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 4580 % __kmp_avail_proc; 4581 } 4582 goto sortAddresses; 4583 4584 case affinity_physical: 4585 if (__kmp_nThreadsPerCore > 1) { 4586 __kmp_affinity_compact = 1; 4587 if (__kmp_affinity_compact >= depth) { 4588 __kmp_affinity_compact = 0; 4589 } 4590 } else { 4591 __kmp_affinity_compact = 0; 4592 } 4593 if (__kmp_affinity_offset) { 4594 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 4595 % __kmp_avail_proc; 4596 } 4597 goto sortAddresses; 4598 4599 case affinity_scatter: 4600 if (__kmp_affinity_compact >= depth) { 4601 __kmp_affinity_compact = 0; 4602 } 4603 else { 4604 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 4605 } 4606 goto sortAddresses; 4607 4608 case affinity_compact: 4609 if (__kmp_affinity_compact >= depth) { 4610 __kmp_affinity_compact = depth - 1; 4611 } 4612 goto sortAddresses; 4613 4614 case affinity_balanced: 4615 if( depth <= 1 ) { 4616 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 4617 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 4618 } 4619 __kmp_affinity_type = affinity_none; 4620 return; 4621 } else if( __kmp_affinity_uniform_topology() ) { 4622 break; 4623 } else { // Non-uniform topology 4624 4625 // Save the depth for further usage 4626 __kmp_aff_depth = depth; 4627 4628 int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, depth - 1); 4629 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, depth - 1, core_level); 4630 int maxprocpercore = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, depth - 1, core_level); 4631 4632 int nproc = ncores * maxprocpercore; 4633 if( ( nproc < 2 ) || ( nproc < __kmp_avail_proc ) ) { 4634 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 4635 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 4636 } 4637 __kmp_affinity_type = affinity_none; 4638 return; 4639 } 4640 4641 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 4642 for( int i = 0; i < nproc; i++ ) { 4643 procarr[ i ] = -1; 4644 } 4645 4646 int lastcore = -1; 4647 int inlastcore = 0; 4648 for( int i = 0; i < __kmp_avail_proc; i++ ) { 4649 int proc = address2os[ i ].second; 4650 int core = __kmp_affinity_find_core(address2os, i, depth - 1, core_level); 4651 4652 if ( core == lastcore ) { 4653 inlastcore++; 4654 } else { 4655 inlastcore = 0; 4656 } 4657 lastcore = core; 4658 4659 procarr[ core * maxprocpercore + inlastcore ] = proc; 4660 } 4661 4662 break; 4663 } 4664 4665 sortAddresses: 4666 // 4667 // Allocate the gtid->affinity mask table. 4668 // 4669 if (__kmp_affinity_dups) { 4670 __kmp_affinity_num_masks = __kmp_avail_proc; 4671 } 4672 else { 4673 __kmp_affinity_num_masks = numUnique; 4674 } 4675 4676 # if OMP_40_ENABLED 4677 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 4678 && ( __kmp_affinity_num_places > 0 ) 4679 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 4680 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4681 } 4682 # endif 4683 4684 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4685 4686 // 4687 // Sort the address2os table according to the current setting of 4688 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
// 4690 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 4691 __kmp_affinity_cmp_Address_child_num); 4692 { 4693 int i; 4694 unsigned j; 4695 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 4696 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 4697 continue; 4698 } 4699 unsigned osId = address2os[i].second; 4700 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4701 kmp_affin_mask_t *dest 4702 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4703 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4704 KMP_CPU_COPY(dest, src); 4705 if (++j >= __kmp_affinity_num_masks) { 4706 break; 4707 } 4708 } 4709 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4710 } 4711 break; 4712 4713 default: 4714 KMP_ASSERT2(0, "Unexpected affinity setting"); 4715 } 4716 4717 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex+1); 4718 machine_hierarchy.init(address2os, __kmp_avail_proc); 4719 } 4720 #undef KMP_EXIT_AFF_NONE 4721 4722 4723 void 4724 __kmp_affinity_initialize(void) 4725 { 4726 // 4727 // Much of the code above was written assuming that if a machine was not 4728 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4729 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4730 // 4731 // There are too many checks for __kmp_affinity_type == affinity_none 4732 // in this code. Instead of trying to change them all, check if 4733 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4734 // affinity_none, call the real initialization routine, then restore 4735 // __kmp_affinity_type to affinity_disabled. 4736 // 4737 int disabled = (__kmp_affinity_type == affinity_disabled); 4738 if (! KMP_AFFINITY_CAPABLE()) { 4739 KMP_ASSERT(disabled); 4740 } 4741 if (disabled) { 4742 __kmp_affinity_type = affinity_none; 4743 } 4744 __kmp_aux_affinity_initialize(); 4745 if (disabled) { 4746 __kmp_affinity_type = affinity_disabled; 4747 } 4748 } 4749 4750 4751 void 4752 __kmp_affinity_uninitialize(void) 4753 { 4754 if (__kmp_affinity_masks != NULL) { 4755 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4756 __kmp_affinity_masks = NULL; 4757 } 4758 if (__kmp_affin_fullMask != NULL) { 4759 KMP_CPU_FREE(__kmp_affin_fullMask); 4760 __kmp_affin_fullMask = NULL; 4761 } 4762 __kmp_affinity_num_masks = 0; 4763 __kmp_affinity_type = affinity_default; 4764 # if OMP_40_ENABLED 4765 __kmp_affinity_num_places = 0; 4766 # endif 4767 if (__kmp_affinity_proclist != NULL) { 4768 __kmp_free(__kmp_affinity_proclist); 4769 __kmp_affinity_proclist = NULL; 4770 } 4771 if( address2os != NULL ) { 4772 __kmp_free( address2os ); 4773 address2os = NULL; 4774 } 4775 if( procarr != NULL ) { 4776 __kmp_free( procarr ); 4777 procarr = NULL; 4778 } 4779 # if KMP_USE_HWLOC 4780 if (__kmp_hwloc_topology != NULL) { 4781 hwloc_topology_destroy(__kmp_hwloc_topology); 4782 __kmp_hwloc_topology = NULL; 4783 } 4784 # endif 4785 KMPAffinity::destroy_api(); 4786 } 4787 4788 4789 void 4790 __kmp_affinity_set_init_mask(int gtid, int isa_root) 4791 { 4792 if (! KMP_AFFINITY_CAPABLE()) { 4793 return; 4794 } 4795 4796 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4797 if (th->th.th_affin_mask == NULL) { 4798 KMP_CPU_ALLOC(th->th.th_affin_mask); 4799 } 4800 else { 4801 KMP_CPU_ZERO(th->th.th_affin_mask); 4802 } 4803 4804 // 4805 // Copy the thread mask to the kmp_info_t structure. 4806 // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e.
one 4807 // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask 4808 // is set, then the full mask is the same as the mask of the initialization 4809 // thread. 4810 // 4811 kmp_affin_mask_t *mask; 4812 int i; 4813 4814 # if OMP_40_ENABLED 4815 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4816 # endif 4817 { 4818 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced) 4819 ) { 4820 # if KMP_GROUP_AFFINITY 4821 if (__kmp_num_proc_groups > 1) { 4822 return; 4823 } 4824 # endif 4825 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4826 i = KMP_PLACE_ALL; 4827 mask = __kmp_affin_fullMask; 4828 } 4829 else { 4830 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4831 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4832 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4833 } 4834 } 4835 # if OMP_40_ENABLED 4836 else { 4837 if ((! isa_root) 4838 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4839 # if KMP_GROUP_AFFINITY 4840 if (__kmp_num_proc_groups > 1) { 4841 return; 4842 } 4843 # endif 4844 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4845 i = KMP_PLACE_ALL; 4846 mask = __kmp_affin_fullMask; 4847 } 4848 else { 4849 // 4850 // int i = some hash function or just a counter that doesn't 4851 // always start at 0. Use gtid for now. 4852 // 4853 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4854 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4855 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4856 } 4857 } 4858 # endif 4859 4860 # if OMP_40_ENABLED 4861 th->th.th_current_place = i; 4862 if (isa_root) { 4863 th->th.th_new_place = i; 4864 th->th.th_first_place = 0; 4865 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4866 } 4867 4868 if (i == KMP_PLACE_ALL) { 4869 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4870 gtid)); 4871 } 4872 else { 4873 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4874 gtid, i)); 4875 } 4876 # else 4877 if (i == -1) { 4878 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4879 gtid)); 4880 } 4881 else { 4882 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4883 gtid, i)); 4884 } 4885 # endif /* OMP_40_ENABLED */ 4886 4887 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4888 4889 if (__kmp_affinity_verbose) { 4890 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4891 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4892 th->th.th_affin_mask); 4893 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),__kmp_gettid(), gtid, buf); 4894 } 4895 4896 # if KMP_OS_WINDOWS 4897 // 4898 // On Windows* OS, the process affinity mask might have changed. 4899 // If the user didn't request affinity and this call fails, 4900 // just continue silently. See CQ171393. 4901 // 4902 if ( __kmp_affinity_type == affinity_none ) { 4903 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4904 } 4905 else 4906 # endif 4907 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4908 } 4909 4910 4911 # if OMP_40_ENABLED 4912 4913 void 4914 __kmp_affinity_set_place(int gtid) 4915 { 4916 int retval; 4917 4918 if (! 
KMP_AFFINITY_CAPABLE()) { 4919 return; 4920 } 4921 4922 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4923 4924 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n", 4925 gtid, th->th.th_new_place, th->th.th_current_place)); 4926 4927 // 4928 // Check that the new place is within this thread's partition. 4929 // 4930 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4931 KMP_ASSERT(th->th.th_new_place >= 0); 4932 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4933 if (th->th.th_first_place <= th->th.th_last_place) { 4934 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) 4935 && (th->th.th_new_place <= th->th.th_last_place)); 4936 } 4937 else { 4938 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) 4939 || (th->th.th_new_place >= th->th.th_last_place)); 4940 } 4941 4942 // 4943 // Copy the thread mask to the kmp_info_t structure, 4944 // and set this thread's affinity. 4945 // 4946 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, 4947 th->th.th_new_place); 4948 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4949 th->th.th_current_place = th->th.th_new_place; 4950 4951 if (__kmp_affinity_verbose) { 4952 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4953 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4954 th->th.th_affin_mask); 4955 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); 4956 } 4957 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4958 } 4959 4960 # endif /* OMP_40_ENABLED */ 4961 4962 4963 int 4964 __kmp_aux_set_affinity(void **mask) 4965 { 4966 int gtid; 4967 kmp_info_t *th; 4968 int retval; 4969 4970 if (! KMP_AFFINITY_CAPABLE()) { 4971 return -1; 4972 } 4973 4974 gtid = __kmp_entry_gtid(); 4975 KA_TRACE(1000, ;{ 4976 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4977 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4978 (kmp_affin_mask_t *)(*mask)); 4979 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4980 gtid, buf); 4981 }); 4982 4983 if (__kmp_env_consistency_check) { 4984 if ((mask == NULL) || (*mask == NULL)) { 4985 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4986 } 4987 else { 4988 unsigned proc; 4989 int num_procs = 0; 4990 4991 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) { 4992 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4993 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4994 } 4995 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4996 continue; 4997 } 4998 num_procs++; 4999 } 5000 if (num_procs == 0) { 5001 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5002 } 5003 5004 # if KMP_GROUP_AFFINITY 5005 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 5006 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 5007 } 5008 # endif /* KMP_GROUP_AFFINITY */ 5009 5010 } 5011 } 5012 5013 th = __kmp_threads[gtid]; 5014 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5015 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5016 if (retval == 0) { 5017 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 5018 } 5019 5020 # if OMP_40_ENABLED 5021 th->th.th_current_place = KMP_PLACE_UNDEFINED; 5022 th->th.th_new_place = KMP_PLACE_UNDEFINED; 5023 th->th.th_first_place = 0; 5024 th->th.th_last_place = __kmp_affinity_num_masks - 1; 5025 5026 // 5027 // Turn off 4.0 affinity for the current thread at this parallel level.
5028 // 5029 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 5030 # endif 5031 5032 return retval; 5033 } 5034 5035 5036 int 5037 __kmp_aux_get_affinity(void **mask) 5038 { 5039 int gtid; 5040 int retval; 5041 kmp_info_t *th; 5042 5043 if (! KMP_AFFINITY_CAPABLE()) { 5044 return -1; 5045 } 5046 5047 gtid = __kmp_entry_gtid(); 5048 th = __kmp_threads[gtid]; 5049 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 5050 5051 KA_TRACE(1000, ;{ 5052 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5053 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5054 th->th.th_affin_mask); 5055 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf); 5056 }); 5057 5058 if (__kmp_env_consistency_check) { 5059 if ((mask == NULL) || (*mask == NULL)) { 5060 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 5061 } 5062 } 5063 5064 # if !KMP_OS_WINDOWS 5065 5066 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 5067 KA_TRACE(1000, ;{ 5068 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5069 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5070 (kmp_affin_mask_t *)(*mask)); 5071 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf); 5072 }); 5073 return retval; 5074 5075 # else 5076 5077 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 5078 return 0; 5079 5080 # endif /* KMP_OS_WINDOWS */ 5081 5082 } 5083 5084 int 5085 __kmp_aux_get_affinity_max_proc() { 5086 if (! KMP_AFFINITY_CAPABLE()) { 5087 return 0; 5088 } 5089 #if KMP_GROUP_AFFINITY 5090 if ( __kmp_num_proc_groups > 1 ) { 5091 return (int)(__kmp_num_proc_groups*sizeof(DWORD_PTR)*CHAR_BIT); 5092 } 5093 #endif 5094 return __kmp_xproc; 5095 } 5096 5097 int 5098 __kmp_aux_set_affinity_mask_proc(int proc, void **mask) 5099 { 5100 int retval; 5101 5102 if (! KMP_AFFINITY_CAPABLE()) { 5103 return -1; 5104 } 5105 5106 KA_TRACE(1000, ;{ 5107 int gtid = __kmp_entry_gtid(); 5108 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5109 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5110 (kmp_affin_mask_t *)(*mask)); 5111 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 5112 proc, gtid, buf); 5113 }); 5114 5115 if (__kmp_env_consistency_check) { 5116 if ((mask == NULL) || (*mask == NULL)) { 5117 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 5118 } 5119 } 5120 5121 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5122 return -1; 5123 } 5124 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5125 return -2; 5126 } 5127 5128 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 5129 return 0; 5130 } 5131 5132 5133 int 5134 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 5135 { 5136 int retval; 5137 5138 if (! KMP_AFFINITY_CAPABLE()) { 5139 return -1; 5140 } 5141 5142 KA_TRACE(1000, ;{ 5143 int gtid = __kmp_entry_gtid(); 5144 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5145 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5146 (kmp_affin_mask_t *)(*mask)); 5147 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 5148 proc, gtid, buf); 5149 }); 5150 5151 if (__kmp_env_consistency_check) { 5152 if ((mask == NULL) || (*mask == NULL)) { 5153 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 5154 } 5155 } 5156 5157 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5158 return -1; 5159 } 5160 if (! 
KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5161 return -2; 5162 } 5163 5164 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 5165 return 0; 5166 } 5167 5168 5169 int 5170 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 5171 { 5172 int retval; 5173 5174 if (! KMP_AFFINITY_CAPABLE()) { 5175 return -1; 5176 } 5177 5178 KA_TRACE(1000, ;{ 5179 int gtid = __kmp_entry_gtid(); 5180 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5181 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 5182 (kmp_affin_mask_t *)(*mask)); 5183 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n", 5184 proc, gtid, buf); 5185 }); 5186 5187 if (__kmp_env_consistency_check) { 5188 if ((mask == NULL) || (*mask == NULL)) { 5189 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 5190 } 5191 } 5192 5193 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 5194 return -1; 5195 } 5196 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 5197 return 0; 5198 } 5199 5200 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 5201 } 5202 5203 5204 // Dynamic affinity settings - Affinity balanced 5205 void __kmp_balanced_affinity( int tid, int nthreads ) 5206 { 5207 bool fine_gran = true; 5208 5209 switch (__kmp_affinity_gran) { 5210 case affinity_gran_fine: 5211 case affinity_gran_thread: 5212 break; 5213 case affinity_gran_core: 5214 if( __kmp_nThreadsPerCore > 1) { 5215 fine_gran = false; 5216 } 5217 break; 5218 case affinity_gran_package: 5219 if( nCoresPerPkg > 1) { 5220 fine_gran = false; 5221 } 5222 break; 5223 default: 5224 fine_gran = false; 5225 } 5226 5227 if( __kmp_affinity_uniform_topology() ) { 5228 int coreID; 5229 int threadID; 5230 // Number of hyper threads per core in HT machine 5231 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 5232 // Number of cores 5233 int ncores = __kmp_ncores; 5234 if( ( nPackages > 1 ) && ( __kmp_nth_per_core <= 1 ) ) { 5235 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 5236 ncores = nPackages; 5237 } 5238 // How many threads will be bound to each core 5239 int chunk = nthreads / ncores; 5240 // How many cores will have an additional thread bound to it - "big cores" 5241 int big_cores = nthreads % ncores; 5242 // Number of threads on the big cores 5243 int big_nth = ( chunk + 1 ) * big_cores; 5244 if( tid < big_nth ) { 5245 coreID = tid / (chunk + 1 ); 5246 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ; 5247 } else { //tid >= big_nth 5248 coreID = ( tid - big_cores ) / chunk; 5249 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ; 5250 } 5251 5252 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 5253 "Illegal set affinity operation when not capable"); 5254 5255 kmp_affin_mask_t *mask; 5256 KMP_CPU_ALLOC_ON_STACK(mask); 5257 KMP_CPU_ZERO(mask); 5258 5259 if( fine_gran ) { 5260 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; 5261 KMP_CPU_SET( osID, mask); 5262 } else { 5263 for( int i = 0; i < __kmp_nth_per_core; i++ ) { 5264 int osID; 5265 osID = address2os[ coreID * __kmp_nth_per_core + i ].second; 5266 KMP_CPU_SET( osID, mask); 5267 } 5268 } 5269 if (__kmp_affinity_verbose) { 5270 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5271 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5272 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5273 __kmp_gettid(), tid, buf); 5274 } 5275 __kmp_set_system_affinity( mask, TRUE ); 5276 KMP_CPU_FREE_FROM_STACK(mask); 5277 } else { // Non-uniform topology 5278 5279 kmp_affin_mask_t *mask; 5280 
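// The non-uniform balanced placement below relies on procarr[] built during
// initialization: each core owns nth_per_core slots holding OS proc ids (or -1
// for missing contexts). Depending on how nthreads compares with the usable core
// count, threads are either mapped one-to-one onto PUs, given a whole core each,
// or distributed over cores round-robin so that cores exposing more PUs absorb
// the extra threads.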
KMP_CPU_ALLOC_ON_STACK(mask); 5281 KMP_CPU_ZERO(mask); 5282 5283 int core_level = __kmp_affinity_find_core_level(address2os, __kmp_avail_proc, __kmp_aff_depth - 1); 5284 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 5285 int nth_per_core = __kmp_affinity_max_proc_per_core(address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 5286 5287 // For performance gain consider the special case nthreads == __kmp_avail_proc 5288 if( nthreads == __kmp_avail_proc ) { 5289 if( fine_gran ) { 5290 int osID = address2os[ tid ].second; 5291 KMP_CPU_SET( osID, mask); 5292 } else { 5293 int core = __kmp_affinity_find_core(address2os, tid, __kmp_aff_depth - 1, core_level); 5294 for( int i = 0; i < __kmp_avail_proc; i++ ) { 5295 int osID = address2os[ i ].second; 5296 if( __kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1, core_level) == core ) { 5297 KMP_CPU_SET( osID, mask); 5298 } 5299 } 5300 } 5301 } else if( nthreads <= ncores ) { 5302 5303 int core = 0; 5304 for( int i = 0; i < ncores; i++ ) { 5305 // Check if this core from procarr[] is in the mask 5306 int in_mask = 0; 5307 for( int j = 0; j < nth_per_core; j++ ) { 5308 if( procarr[ i * nth_per_core + j ] != - 1 ) { 5309 in_mask = 1; 5310 break; 5311 } 5312 } 5313 if( in_mask ) { 5314 if( tid == core ) { 5315 for( int j = 0; j < nth_per_core; j++ ) { 5316 int osID = procarr[ i * nth_per_core + j ]; 5317 if( osID != -1 ) { 5318 KMP_CPU_SET( osID, mask ); 5319 // For fine granularity it is enough to set the first available osID for this core 5320 if( fine_gran) { 5321 break; 5322 } 5323 } 5324 } 5325 break; 5326 } else { 5327 core++; 5328 } 5329 } 5330 } 5331 5332 } else { // nthreads > ncores 5333 5334 // Array to save the number of processors at each core 5335 int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores); 5336 // Array to save the number of cores with "x" available processors; 5337 int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1)); 5338 // Array to save the number of cores with # procs from x to nth_per_core 5339 int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1)); 5340 5341 for( int i = 0; i <= nth_per_core; i++ ) { 5342 ncores_with_x_procs[ i ] = 0; 5343 ncores_with_x_to_max_procs[ i ] = 0; 5344 } 5345 5346 for( int i = 0; i < ncores; i++ ) { 5347 int cnt = 0; 5348 for( int j = 0; j < nth_per_core; j++ ) { 5349 if( procarr[ i * nth_per_core + j ] != -1 ) { 5350 cnt++; 5351 } 5352 } 5353 nproc_at_core[ i ] = cnt; 5354 ncores_with_x_procs[ cnt ]++; 5355 } 5356 5357 for( int i = 0; i <= nth_per_core; i++ ) { 5358 for( int j = i; j <= nth_per_core; j++ ) { 5359 ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ]; 5360 } 5361 } 5362 5363 // Max number of processors 5364 int nproc = nth_per_core * ncores; 5365 // An array to keep number of threads per each context 5366 int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 5367 for( int i = 0; i < nproc; i++ ) { 5368 newarr[ i ] = 0; 5369 } 5370 5371 int nth = nthreads; 5372 int flag = 0; 5373 while( nth > 0 ) { 5374 for( int j = 1; j <= nth_per_core; j++ ) { 5375 int cnt = ncores_with_x_to_max_procs[ j ]; 5376 for( int i = 0; i < ncores; i++ ) { 5377 // Skip the core with 0 processors 5378 if( nproc_at_core[ i ] == 0 ) { 5379 continue; 5380 } 5381 for( int k = 0; k < nth_per_core; k++ ) { 5382 if( procarr[ i * nth_per_core + k ] != -1 ) { 5383 if( newarr[ i * nth_per_core + k ] == 0 ) { 5384 newarr[ i * nth_per_core + k ] = 1; 5385 cnt--; 5386 
nth--; 5387 break; 5388 } else { 5389 if( flag != 0 ) { 5390 newarr[ i * nth_per_core + k ] ++; 5391 cnt--; 5392 nth--; 5393 break; 5394 } 5395 } 5396 } 5397 } 5398 if( cnt == 0 || nth == 0 ) { 5399 break; 5400 } 5401 } 5402 if( nth == 0 ) { 5403 break; 5404 } 5405 } 5406 flag = 1; 5407 } 5408 int sum = 0; 5409 for( int i = 0; i < nproc; i++ ) { 5410 sum += newarr[ i ]; 5411 if( sum > tid ) { 5412 if( fine_gran) { 5413 int osID = procarr[ i ]; 5414 KMP_CPU_SET( osID, mask); 5415 } else { 5416 int coreID = i / nth_per_core; 5417 for( int ii = 0; ii < nth_per_core; ii++ ) { 5418 int osID = procarr[ coreID * nth_per_core + ii ]; 5419 if( osID != -1 ) { 5420 KMP_CPU_SET( osID, mask); 5421 } 5422 } 5423 } 5424 break; 5425 } 5426 } 5427 __kmp_free( newarr ); 5428 } 5429 5430 if (__kmp_affinity_verbose) { 5431 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 5432 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 5433 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 5434 __kmp_gettid(), tid, buf); 5435 } 5436 __kmp_set_system_affinity( mask, TRUE ); 5437 KMP_CPU_FREE_FROM_STACK(mask); 5438 } 5439 } 5440 5441 #if KMP_OS_LINUX 5442 // We don't need this entry for Windows because 5443 // there is GetProcessAffinityMask() api 5444 // 5445 // The intended usage is indicated by these steps: 5446 // 1) The user gets the current affinity mask 5447 // 2) Then sets the affinity by calling this function 5448 // 3) Error check the return value 5449 // 4) Use non-OpenMP parallelization 5450 // 5) Reset the affinity to what was stored in step 1) 5451 #ifdef __cplusplus 5452 extern "C" 5453 #endif 5454 int 5455 kmp_set_thread_affinity_mask_initial() 5456 // the function returns 0 on success, 5457 // -1 if we cannot bind thread 5458 // >0 (errno) if an error happened during binding 5459 { 5460 int gtid = __kmp_get_gtid(); 5461 if (gtid < 0) { 5462 // Do not touch non-omp threads 5463 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 5464 "non-omp thread, returning\n")); 5465 return -1; 5466 } 5467 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 5468 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 5469 "affinity not initialized, returning\n")); 5470 return -1; 5471 } 5472 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 5473 "set full mask for thread %d\n", gtid)); 5474 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 5475 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 5476 } 5477 #endif 5478 5479 #endif // KMP_AFFINITY_SUPPORTED 5480
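// A minimal usage sketch for kmp_set_thread_affinity_mask_initial(), following the
// steps listed above; do_non_openmp_work() is a hypothetical placeholder and the
// pthread calls assume a Linux build with _GNU_SOURCE:
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // 1) remember the current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0) {             // 2)-3) widen to the full mask, check result
//       do_non_openmp_work();                                      // 4) run non-OpenMP parallel code
//   }
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // 5) restore the mask saved in step 1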