/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_affinity.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() {
    machine_hierarchy.fini();
}

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of the hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    // Adjust the hierarchy in case the number of threads exceeds the original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
#if KMP_USE_HWLOC
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    int num_chars_to_write, num_chars_written;
    char* scan;
    KMP_ASSERT(buf_len >= 40);

    // bufsize of 0 just retrieves the needed buffer size.
    num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);

    // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
    // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
    //   take into account the '\0' character.
    if (hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
        KMP_SNPRINTF(buf, buf_len, "{<empty>}");
    } else if (num_chars_to_write < buf_len - 3) {
        // no problem fitting the mask into buf_len characters
        buf[0] = '{';
        // use buf_len-3 because we have three characters: '{', '}', '\0' to add to the buffer
        num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
        buf[num_chars_written+1] = '}';
        buf[num_chars_written+2] = '\0';
    } else {
        // Need to truncate the affinity mask string and add an ellipsis.
        // To do this, we first write out the '{' + str(mask),
        // then go to the 7th-to-last character and scan backwards until we are
        // NOT on a digit, and write "...}\0".  This gives a clean ellipsis and
        // avoids overwriting part of an affinity number, i.e., we get
        // { 45, 67,...} instead of something like { 45, 67, 8...}.
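        // Starting 7 characters from the end of the buffer leaves room for the
        // "...}" and terminating '\0' that are written below.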
        scan = buf + buf_len - 7;
        while (*scan >= '0' && *scan <= '9' && scan >= buf)
            scan--;
        *(scan+1) = '.';
        *(scan+2) = '.';
        *(scan+3) = '.';
        *(scan+4) = '}';
        *(scan+5) = '\0';
    }
    return buf;
}
#else
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print, for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end-scan+1, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, end-scan+1, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}
#endif // KMP_USE_HWLOC


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example: suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
//
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
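// Each line pairs an OS proc id with its per-level labels, e.g. (informally)
// "OS proc 5 maps to Package 0 Core 2 Thread 1"; the exact wording comes from
// the message catalog entries used below.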
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}

#if KMP_USE_HWLOC

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology.  The most common example is when
// there is one thread context per core: the extra thread-context level offers
// no unique labels, so it is removed.
// Return value: the new depth of address2os.
static int
__kmp_affinity_remove_radix_one_levels(AddrUnsPair *address2os,
  int nActiveThreads, int depth, int* pkgLevel, int* coreLevel, int* threadLevel)
{
    int level;
    int i;
    int radix1_detected;

    for (level = depth-1; level >= 0; --level) {
        // Always keep the package level
        if (level == *pkgLevel)
            continue;
        // Detect if this level is radix 1
        radix1_detected = 1;
        for (i = 1; i < nActiveThreads; ++i) {
            if (address2os[0].first.labels[level] != address2os[i].first.labels[level]) {
                // There are differing label values for this level so it stays
                radix1_detected = 0;
                break;
            }
        }
        if (!radix1_detected)
            continue;
        // Radix 1 was detected
        if (level == *threadLevel) {
            // If only one thread per core, then just decrement
            // the depth, which removes the thread level from address2os
            for (i = 0; i < nActiveThreads; ++i) {
                address2os[i].first.depth--;
            }
            *threadLevel = -1;
        } else if (level == *coreLevel) {
            // For the core level, we move the thread labels over if they are still
            // valid (*threadLevel != -1), and also reduce the depth another level
            for (i = 0; i < nActiveThreads; ++i) {
                if (*threadLevel != -1) {
                    address2os[i].first.labels[*coreLevel] = address2os[i].first.labels[*threadLevel];
                }
                address2os[i].first.depth--;
            }
            *coreLevel = -1;
        }
    }
    return address2os[0].first.depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure.  E.g., if obj is a HWLOC_OBJ_SOCKET object and type is
// HWLOC_OBJ_PU, then this will return the number of PUs under the SOCKET
// object.
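// Similarly, passing a core object with type HWLOC_OBJ_PU counts the hardware
// threads of that core; the loop only counts objects whose ancestor of
// obj->type is 'obj' itself.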
static int
__kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj, hwloc_obj_type_t type) {
    int retval = 0;
    hwloc_obj_t first;
    for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type, obj->logical_index, type, 0);
         first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) == obj;
         first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type, first))
    {
        ++retval;
    }
    return retval;
}

static int
__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    int depth = 3;
    int pkgLevel = 0;
    int coreLevel = 1;
    int threadLevel = 2;

    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0), HWLOC_OBJ_CORE);
        __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, 0), HWLOC_OBJ_PU);
        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
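    // The nested loops below walk the hwloc tree socket -> core -> PU,
    // assigning small consecutive logical ids at each level and counting only
    // PUs present in __kmp_affin_fullMask, so the totals reflect just the
    // resources this process is allowed to use.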
    //

    hwloc_obj_t pu;
    hwloc_obj_t core;
    hwloc_obj_t socket;
    int nActiveThreads = 0;
    int socket_identifier = 0;
    // re-calculate globals to count only accessible resources
    __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
    for (socket = hwloc_get_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, 0);
         socket != NULL;
         socket = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET, socket),
         socket_identifier++)
    {
        int core_identifier = 0;
        int num_active_cores = 0;
        for (core = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, socket->type, socket->logical_index, HWLOC_OBJ_CORE, 0);
             core != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, socket->type, core) == socket;
             core = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_CORE, core),
             core_identifier++)
        {
            int pu_identifier = 0;
            int num_active_threads = 0;
            for (pu = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, core->type, core->logical_index, HWLOC_OBJ_PU, 0);
                 pu != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, core->type, pu) == core;
                 pu = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, HWLOC_OBJ_PU, pu),
                 pu_identifier++)
            {
                Address addr(3);
                if (! KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
                    continue; // skip inactive (inaccessible) unit
                KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                  socket->os_index, socket->logical_index, core->os_index, core->logical_index, pu->os_index, pu->logical_index));
                addr.labels[0] = socket_identifier; // package
                addr.labels[1] = core_identifier;   // core
                addr.labels[2] = pu_identifier;     // pu
                retval[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
                __kmp_pu_os_idx[nActiveThreads] = pu->os_index; // keep os index for each active pu
                nActiveThreads++;
                ++num_active_threads; // count active threads per core
            }
            if (num_active_threads) { // were there any active threads on the core?
                ++__kmp_ncores; // count total active cores
                ++num_active_cores; // count active cores per socket
                if (num_active_threads > __kmp_nThreadsPerCore)
                    __kmp_nThreadsPerCore = num_active_threads; // calc maximum
            }
        }
        if (num_active_cores) { // were there any active cores on the socket?
            ++nPackages; // count total active packages
            if (num_active_cores > nCoresPerPkg)
                nCoresPerPkg = num_active_cores; // calc maximum
        }
    }

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
    KMP_ASSERT(nActiveThreads > 0);
    if (nActiveThreads == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Check to see if the machine topology is uniform
    //
    unsigned uniform = (nPackages * nCoresPerPkg * __kmp_nThreadsPerCore == nActiveThreads);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", nPackages);
        //for (level = 1; level <= pkgLevel; level++) {
        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        //}
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth, &pkgLevel, &coreLevel, &threadLevel);

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
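        // (__kmp_affinity_gran_levels is, roughly, the number of innermost
        // levels that get collapsed when affinity masks are formed, so threads
        // may float across them; 0 means bind at the finest modeled level.)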
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nActiveThreads, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}
#endif // KMP_USE_HWLOC

//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread ids <-> processor ids.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    if (__kmp_affinity_type == affinity_none) {
        int avail_ct = 0;
        unsigned int i;
        KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
            if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask))
                continue;
            __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
        }
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(__kmp_affin_fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
        __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while ((1<<r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this only as an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly set
        //   to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //     value of this field determines the width of the core# + thread#
    //     fields in the Apic Id.  It is also an upper bound on the number
    //     of threads per package, but it has been verified that situations
    //     happen where it is not exact.  In particular, on certain OS/chip
    //     combinations where Intel(R) Hyper-Threading Technology is supported
    //     by the chip but has been disabled, the value of this field will be 2
    //     (for a single core chip).  On other OS/chip combinations supporting
    //     Intel(R) Hyper-Threading Technology, the value of this field will be
    //     1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //     is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //     value of this field (+1) determines the width of the core# field in
    //     the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //     an upper bound, but the IA-32 architecture manual says that it is
    //     exactly the number of cores per package, and I haven't seen any
    //     case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
        __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        // edx bit 9 is the on-chip APIC feature flag.
        if (((buf.edx >> 9) & 1) == 0) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
            threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
            threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgIds may be sparsely
    // assigned among the chips on a system.  Although coreIds are usually
    // assigned [0 .. coresPerPkg-1] and threadIds are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1; // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consistency checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

    }
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
    __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
    for (i = 0; i < nApics; ++i) {
        __kmp_pu_os_idx[i] = threadInfo[i].osId;
    }
    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We
    // will try to get more accurate values later by explicitly counting them,
    // but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine topology,
            // but so far, the only machine we have seen which does not exit
            // this loop before iteration 32 has fubar x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
        __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
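    //
    // As an illustrative (hypothetical) example: on a machine with 2 packages,
    // 4 cores per package and 2 threads per core, the loop below ends up with
    // totals[] = {2, 8, 16} and maxCt[] = {2, 4, 2}, where index 0 is the
    // package level after the inversion performed above.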
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology information
                    // in case the max value for some level (maxCt[level]) is encountered
                    // earlier than some smaller value while going through the array.
                    // For example, let pkg0 have 4 cores and pkg1 have 2 cores.  Then
                    // maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
1737 // 1738 if (__kmp_affinity_verbose) { 1739 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1740 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1741 1742 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1743 if (__kmp_affinity_respect_mask) { 1744 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1745 } else { 1746 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1747 } 1748 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1749 if (uniform) { 1750 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1751 } else { 1752 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1753 } 1754 1755 kmp_str_buf_t buf; 1756 __kmp_str_buf_init(&buf); 1757 1758 __kmp_str_buf_print(&buf, "%d", totals[0]); 1759 for (level = 1; level <= pkgLevel; level++) { 1760 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1761 } 1762 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1763 __kmp_nThreadsPerCore, __kmp_ncores); 1764 1765 __kmp_str_buf_free(&buf); 1766 } 1767 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 1768 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc); 1769 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 1770 for (proc = 0; (int)proc < nApics; ++proc) { 1771 __kmp_pu_os_idx[proc] = retval[proc].second; 1772 } 1773 if (__kmp_affinity_type == affinity_none) { 1774 __kmp_free(last); 1775 __kmp_free(maxCt); 1776 __kmp_free(counts); 1777 __kmp_free(totals); 1778 __kmp_free(retval); 1779 KMP_CPU_FREE(oldMask); 1780 return 0; 1781 } 1782 1783 // 1784 // Find any levels with radiix 1, and remove them from the map 1785 // (except for the package level). 1786 // 1787 int new_depth = 0; 1788 for (level = 0; level < depth; level++) { 1789 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1790 continue; 1791 } 1792 new_depth++; 1793 } 1794 1795 // 1796 // If we are removing any levels, allocate a new vector to return, 1797 // and copy the relevant information to it. 1798 // 1799 if (new_depth != depth) { 1800 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1801 sizeof(AddrUnsPair) * nApics); 1802 for (proc = 0; (int)proc < nApics; proc++) { 1803 Address addr(new_depth); 1804 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1805 } 1806 int new_level = 0; 1807 int newPkgLevel = -1; 1808 int newCoreLevel = -1; 1809 int newThreadLevel = -1; 1810 int i; 1811 for (level = 0; level < depth; level++) { 1812 if ((maxCt[level] == 1) 1813 && (level != pkgLevel)) { 1814 // 1815 // Remove this level. Never remove the package level 1816 // 1817 continue; 1818 } 1819 if (level == pkgLevel) { 1820 newPkgLevel = level; 1821 } 1822 if (level == coreLevel) { 1823 newCoreLevel = level; 1824 } 1825 if (level == threadLevel) { 1826 newThreadLevel = level; 1827 } 1828 for (proc = 0; (int)proc < nApics; proc++) { 1829 new_retval[proc].first.labels[new_level] 1830 = retval[proc].first.labels[level]; 1831 } 1832 new_level++; 1833 } 1834 1835 __kmp_free(retval); 1836 retval = new_retval; 1837 depth = new_depth; 1838 pkgLevel = newPkgLevel; 1839 coreLevel = newCoreLevel; 1840 threadLevel = newThreadLevel; 1841 } 1842 1843 if (__kmp_affinity_gran_levels < 0) { 1844 // 1845 // Set the granularity level based on what levels are modeled 1846 // in the machine topology map. 
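    // For example, assuming the usual ordering affinity_gran_thread <
    // affinity_gran_core < affinity_gran_package: on a map that models the
    // package, core and thread levels, granularity=core folds only the
    // thread level into each mask (__kmp_affinity_gran_levels == 1), while
    // granularity=package folds both the thread and core levels
    // (__kmp_affinity_gran_levels == 2).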
1847 // 1848 __kmp_affinity_gran_levels = 0; 1849 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1850 __kmp_affinity_gran_levels++; 1851 } 1852 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1853 __kmp_affinity_gran_levels++; 1854 } 1855 if (__kmp_affinity_gran > affinity_gran_package) { 1856 __kmp_affinity_gran_levels++; 1857 } 1858 } 1859 1860 if (__kmp_affinity_verbose) { 1861 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1862 coreLevel, threadLevel); 1863 } 1864 1865 __kmp_free(last); 1866 __kmp_free(maxCt); 1867 __kmp_free(counts); 1868 __kmp_free(totals); 1869 KMP_CPU_FREE(oldMask); 1870 *address2os = retval; 1871 return depth; 1872 } 1873 1874 1875 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1876 1877 1878 #define osIdIndex 0 1879 #define threadIdIndex 1 1880 #define coreIdIndex 2 1881 #define pkgIdIndex 3 1882 #define nodeIdIndex 4 1883 1884 typedef unsigned *ProcCpuInfo; 1885 static unsigned maxIndex = pkgIdIndex; 1886 1887 1888 static int 1889 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1890 { 1891 const unsigned *aa = (const unsigned *)a; 1892 const unsigned *bb = (const unsigned *)b; 1893 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1894 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1895 return 0; 1896 }; 1897 1898 1899 static int 1900 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1901 { 1902 unsigned i; 1903 const unsigned *aa = *((const unsigned **)a); 1904 const unsigned *bb = *((const unsigned **)b); 1905 for (i = maxIndex; ; i--) { 1906 if (aa[i] < bb[i]) return -1; 1907 if (aa[i] > bb[i]) return 1; 1908 if (i == osIdIndex) break; 1909 } 1910 return 0; 1911 } 1912 1913 1914 // 1915 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1916 // affinity map. 1917 // 1918 static int 1919 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1920 kmp_i18n_id_t *const msg_id, FILE *f) 1921 { 1922 *address2os = NULL; 1923 *msg_id = kmp_i18n_null; 1924 1925 // 1926 // Scan of the file, and count the number of "processor" (osId) fields, 1927 // and find the highest value of <n> for a node_<n> field. 1928 // 1929 char buf[256]; 1930 unsigned num_records = 0; 1931 while (! feof(f)) { 1932 buf[sizeof(buf) - 1] = 1; 1933 if (! fgets(buf, sizeof(buf), f)) { 1934 // 1935 // Read errors presumably because of EOF 1936 // 1937 break; 1938 } 1939 1940 char s1[] = "processor"; 1941 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1942 num_records++; 1943 continue; 1944 } 1945 1946 // 1947 // FIXME - this will match "node_<n> <garbage>" 1948 // 1949 unsigned level; 1950 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1951 if (nodeIdIndex + level >= maxIndex) { 1952 maxIndex = nodeIdIndex + level; 1953 } 1954 continue; 1955 } 1956 } 1957 1958 // 1959 // Check for empty file / no valid processor records, or too many. 1960 // The number of records can't exceed the number of valid bits in the 1961 // affinity mask. 1962 // 1963 if (num_records == 0) { 1964 *line = 0; 1965 *msg_id = kmp_i18n_str_NoProcRecords; 1966 return -1; 1967 } 1968 if (num_records > (unsigned)__kmp_xproc) { 1969 *line = 0; 1970 *msg_id = kmp_i18n_str_TooManyProcRecords; 1971 return -1; 1972 } 1973 1974 // 1975 // Set the file pointer back to the begginning, so that we can scan the 1976 // file again, this time performing a full parse of the data. 1977 // Allocate a vector of ProcCpuInfo object, where we will place the data. 
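    // Each row threadInfo[i] holds one unsigned per field index (osIdIndex,
    // threadIdIndex, coreIdIndex, pkgIdIndex, plus one slot per node_<n>
    // level up to maxIndex); fields the file never mentions stay UINT_MAX.
    // For instance, a (hypothetical, abbreviated) record such as
    //     processor   : 5
    //     physical id : 1
    //     core id     : 2
    // would fill in osId = 5, pkgId = 1 and coreId = 2, leaving the thread
    // id unset so that it can be auto-assigned later.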
1978 // Adding an extra element at the end allows us to remove a lot of extra 1979 // checks for termination conditions. 1980 // 1981 if (fseek(f, 0, SEEK_SET) != 0) { 1982 *line = 0; 1983 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1984 return -1; 1985 } 1986 1987 // 1988 // Allocate the array of records to store the proc info in. The dummy 1989 // element at the end makes the logic in filling them out easier to code. 1990 // 1991 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1992 * sizeof(unsigned *)); 1993 unsigned i; 1994 for (i = 0; i <= num_records; i++) { 1995 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1996 * sizeof(unsigned)); 1997 } 1998 1999 #define CLEANUP_THREAD_INFO \ 2000 for (i = 0; i <= num_records; i++) { \ 2001 __kmp_free(threadInfo[i]); \ 2002 } \ 2003 __kmp_free(threadInfo); 2004 2005 // 2006 // A value of UINT_MAX means that we didn't find the field 2007 // 2008 unsigned __index; 2009 2010 #define INIT_PROC_INFO(p) \ 2011 for (__index = 0; __index <= maxIndex; __index++) { \ 2012 (p)[__index] = UINT_MAX; \ 2013 } 2014 2015 for (i = 0; i <= num_records; i++) { 2016 INIT_PROC_INFO(threadInfo[i]); 2017 } 2018 2019 unsigned num_avail = 0; 2020 *line = 0; 2021 while (! feof(f)) { 2022 // 2023 // Create an inner scoping level, so that all the goto targets at the 2024 // end of the loop appear in an outer scoping level. This avoids 2025 // warnings about jumping past an initialization to a target in the 2026 // same block. 2027 // 2028 { 2029 buf[sizeof(buf) - 1] = 1; 2030 bool long_line = false; 2031 if (! fgets(buf, sizeof(buf), f)) { 2032 // 2033 // Read errors presumably because of EOF 2034 // 2035 // If there is valid data in threadInfo[num_avail], then fake 2036 // a blank line in ensure that the last address gets parsed. 2037 // 2038 bool valid = false; 2039 for (i = 0; i <= maxIndex; i++) { 2040 if (threadInfo[num_avail][i] != UINT_MAX) { 2041 valid = true; 2042 } 2043 } 2044 if (! valid) { 2045 break; 2046 } 2047 buf[0] = 0; 2048 } else if (!buf[sizeof(buf) - 1]) { 2049 // 2050 // The line is longer than the buffer. Set a flag and don't 2051 // emit an error if we were going to ignore the line, anyway. 
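    // (This works because buf[sizeof(buf) - 1] is pre-set to a nonzero
    // sentinel before each fgets() call; fgets() only overwrites that last
    // byte with '\0' when the incoming line is long enough to fill the
    // whole buffer.)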
2052 // 2053 long_line = true; 2054 2055 #define CHECK_LINE \ 2056 if (long_line) { \ 2057 CLEANUP_THREAD_INFO; \ 2058 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2059 return -1; \ 2060 } 2061 } 2062 (*line)++; 2063 2064 char s1[] = "processor"; 2065 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2066 CHECK_LINE; 2067 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2068 unsigned val; 2069 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2070 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 2071 threadInfo[num_avail][osIdIndex] = val; 2072 #if KMP_OS_LINUX && USE_SYSFS_INFO 2073 char path[256]; 2074 KMP_SNPRINTF(path, sizeof(path), 2075 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2076 threadInfo[num_avail][osIdIndex]); 2077 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2078 2079 KMP_SNPRINTF(path, sizeof(path), 2080 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2081 threadInfo[num_avail][osIdIndex]); 2082 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2083 continue; 2084 #else 2085 } 2086 char s2[] = "physical id"; 2087 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2088 CHECK_LINE; 2089 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2090 unsigned val; 2091 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2092 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2093 threadInfo[num_avail][pkgIdIndex] = val; 2094 continue; 2095 } 2096 char s3[] = "core id"; 2097 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2098 CHECK_LINE; 2099 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2100 unsigned val; 2101 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2102 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2103 threadInfo[num_avail][coreIdIndex] = val; 2104 continue; 2105 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2106 } 2107 char s4[] = "thread id"; 2108 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2109 CHECK_LINE; 2110 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2111 unsigned val; 2112 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2113 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2114 threadInfo[num_avail][threadIdIndex] = val; 2115 continue; 2116 } 2117 unsigned level; 2118 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2119 CHECK_LINE; 2120 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2121 unsigned val; 2122 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2123 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2124 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2125 threadInfo[num_avail][nodeIdIndex + level] = val; 2126 continue; 2127 } 2128 2129 // 2130 // We didn't recognize the leading token on the line. 2131 // There are lots of leading tokens that we don't recognize - 2132 // if the line isn't empty, go on to the next line. 2133 // 2134 if ((*buf != 0) && (*buf != '\n')) { 2135 // 2136 // If the line is longer than the buffer, read characters 2137 // until we find a newline. 2138 // 2139 if (long_line) { 2140 int ch; 2141 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2142 } 2143 continue; 2144 } 2145 2146 // 2147 // A newline has signalled the end of the processor record. 2148 // Check that there aren't too many procs specified. 2149 // 2150 if ((int)num_avail == __kmp_xproc) { 2151 CLEANUP_THREAD_INFO; 2152 *msg_id = kmp_i18n_str_TooManyEntries; 2153 return -1; 2154 } 2155 2156 // 2157 // Check for missing fields. 
The osId field must be there, and we 2158 // currently require that the physical id field is specified, also. 2159 // 2160 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2161 CLEANUP_THREAD_INFO; 2162 *msg_id = kmp_i18n_str_MissingProcField; 2163 return -1; 2164 } 2165 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2166 CLEANUP_THREAD_INFO; 2167 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2168 return -1; 2169 } 2170 2171 // 2172 // Skip this proc if it is not included in the machine model. 2173 // 2174 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], __kmp_affin_fullMask)) { 2175 INIT_PROC_INFO(threadInfo[num_avail]); 2176 continue; 2177 } 2178 2179 // 2180 // We have a successful parse of this proc's info. 2181 // Increment the counter, and prepare for the next proc. 2182 // 2183 num_avail++; 2184 KMP_ASSERT(num_avail <= num_records); 2185 INIT_PROC_INFO(threadInfo[num_avail]); 2186 } 2187 continue; 2188 2189 no_val: 2190 CLEANUP_THREAD_INFO; 2191 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2192 return -1; 2193 2194 dup_field: 2195 CLEANUP_THREAD_INFO; 2196 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2197 return -1; 2198 } 2199 *line = 0; 2200 2201 # if KMP_MIC && REDUCE_TEAM_SIZE 2202 unsigned teamSize = 0; 2203 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2204 2205 // check for num_records == __kmp_xproc ??? 2206 2207 // 2208 // If there's only one thread context to bind to, form an Address object 2209 // with depth 1 and return immediately (or, if affinity is off, set 2210 // address2os to NULL and return). 2211 // 2212 // If it is configured to omit the package level when there is only a 2213 // single package, the logic at the end of this routine won't work if 2214 // there is only a single thread - it would try to form an Address 2215 // object with depth 0. 2216 // 2217 KMP_ASSERT(num_avail > 0); 2218 KMP_ASSERT(num_avail <= num_records); 2219 if (num_avail == 1) { 2220 __kmp_ncores = 1; 2221 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2222 if (__kmp_affinity_verbose) { 2223 if (! 
KMP_AFFINITY_CAPABLE()) { 2224 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2225 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2226 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2227 } 2228 else { 2229 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2230 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2231 __kmp_affin_fullMask); 2232 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2233 if (__kmp_affinity_respect_mask) { 2234 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2235 } else { 2236 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2237 } 2238 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2239 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2240 } 2241 int index; 2242 kmp_str_buf_t buf; 2243 __kmp_str_buf_init(&buf); 2244 __kmp_str_buf_print(&buf, "1"); 2245 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2246 __kmp_str_buf_print(&buf, " x 1"); 2247 } 2248 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2249 __kmp_str_buf_free(&buf); 2250 } 2251 2252 if (__kmp_affinity_type == affinity_none) { 2253 CLEANUP_THREAD_INFO; 2254 return 0; 2255 } 2256 2257 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2258 Address addr(1); 2259 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2260 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2261 2262 if (__kmp_affinity_gran_levels < 0) { 2263 __kmp_affinity_gran_levels = 0; 2264 } 2265 2266 if (__kmp_affinity_verbose) { 2267 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2268 } 2269 2270 CLEANUP_THREAD_INFO; 2271 return 1; 2272 } 2273 2274 // 2275 // Sort the threadInfo table by physical Id. 2276 // 2277 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2278 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2279 2280 // 2281 // The table is now sorted by pkgId / coreId / threadId, but we really 2282 // don't know the radix of any of the fields. pkgId's may be sparsely 2283 // assigned among the chips on a system. Although coreId's are usually 2284 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2285 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2286 // 2287 // For that matter, we don't know what coresPerPkg and threadsPerCore 2288 // (or the total # packages) are at this point - we want to determine 2289 // that now. We only have an upper bound on the first two figures. 2290 // 2291 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2292 * sizeof(unsigned)); 2293 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2294 * sizeof(unsigned)); 2295 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2296 * sizeof(unsigned)); 2297 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2298 * sizeof(unsigned)); 2299 2300 bool assign_thread_ids = false; 2301 unsigned threadIdCt; 2302 unsigned index; 2303 2304 restart_radix_check: 2305 threadIdCt = 0; 2306 2307 // 2308 // Initialize the counter arrays with data from threadInfo[0]. 2309 // 2310 if (assign_thread_ids) { 2311 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2312 threadInfo[0][threadIdIndex] = threadIdCt++; 2313 } 2314 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2315 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2316 } 2317 } 2318 for (index = 0; index <= maxIndex; index++) { 2319 counts[index] = 1; 2320 maxCt[index] = 1; 2321 totals[index] = 1; 2322 lastId[index] = threadInfo[0][index];; 2323 } 2324 2325 // 2326 // Run through the rest of the OS procs. 
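    // Sketch of what the pass below computes, for a made-up machine with one
    // package, two cores and two HW threads per core (maxIndex == pkgIdIndex,
    // no node_<n> fields).  Given the sorted (pkgId, coreId, threadId) rows
    // (0,0,0) (0,0,1) (0,1,0) (0,1,1), this loop plus the final maxCt sweep
    // leave totals[threadIdIndex] == 4, totals[coreIdIndex] == 2,
    // totals[pkgIdIndex] == 1 and maxCt[threadIdIndex] == maxCt[coreIdIndex]
    // == 2, so __kmp_nThreadsPerCore == 2, nCoresPerPkg == 2, nPackages == 1,
    // __kmp_ncores == 2, and the topology is reported as uniform.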
2327 // 2328 for (i = 1; i < num_avail; i++) { 2329 // 2330 // Find the most significant index whose id differs 2331 // from the id for the previous OS proc. 2332 // 2333 for (index = maxIndex; index >= threadIdIndex; index--) { 2334 if (assign_thread_ids && (index == threadIdIndex)) { 2335 // 2336 // Auto-assign the thread id field if it wasn't specified. 2337 // 2338 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2339 threadInfo[i][threadIdIndex] = threadIdCt++; 2340 } 2341 2342 // 2343 // Aparrently the thread id field was specified for some 2344 // entries and not others. Start the thread id counter 2345 // off at the next higher thread id. 2346 // 2347 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2348 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2349 } 2350 } 2351 if (threadInfo[i][index] != lastId[index]) { 2352 // 2353 // Run through all indices which are less significant, 2354 // and reset the counts to 1. 2355 // 2356 // At all levels up to and including index, we need to 2357 // increment the totals and record the last id. 2358 // 2359 unsigned index2; 2360 for (index2 = threadIdIndex; index2 < index; index2++) { 2361 totals[index2]++; 2362 if (counts[index2] > maxCt[index2]) { 2363 maxCt[index2] = counts[index2]; 2364 } 2365 counts[index2] = 1; 2366 lastId[index2] = threadInfo[i][index2]; 2367 } 2368 counts[index]++; 2369 totals[index]++; 2370 lastId[index] = threadInfo[i][index]; 2371 2372 if (assign_thread_ids && (index > threadIdIndex)) { 2373 2374 # if KMP_MIC && REDUCE_TEAM_SIZE 2375 // 2376 // The default team size is the total #threads in the machine 2377 // minus 1 thread for every core that has 3 or more threads. 2378 // 2379 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2380 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2381 2382 // 2383 // Restart the thread counter, as we are on a new core. 2384 // 2385 threadIdCt = 0; 2386 2387 // 2388 // Auto-assign the thread id field if it wasn't specified. 2389 // 2390 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2391 threadInfo[i][threadIdIndex] = threadIdCt++; 2392 } 2393 2394 // 2395 // Aparrently the thread id field was specified for some 2396 // entries and not others. Start the thread id counter 2397 // off at the next higher thread id. 2398 // 2399 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2400 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2401 } 2402 } 2403 break; 2404 } 2405 } 2406 if (index < threadIdIndex) { 2407 // 2408 // If thread ids were specified, it is an error if they are not 2409 // unique. Also, check that we waven't already restarted the 2410 // loop (to be safe - shouldn't need to). 2411 // 2412 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2413 || assign_thread_ids) { 2414 __kmp_free(lastId); 2415 __kmp_free(totals); 2416 __kmp_free(maxCt); 2417 __kmp_free(counts); 2418 CLEANUP_THREAD_INFO; 2419 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2420 return -1; 2421 } 2422 2423 // 2424 // If the thread ids were not specified and we see entries 2425 // entries that are duplicates, start the loop over and 2426 // assign the thread ids manually. 2427 // 2428 assign_thread_ids = true; 2429 goto restart_radix_check; 2430 } 2431 } 2432 2433 # if KMP_MIC && REDUCE_TEAM_SIZE 2434 // 2435 // The default team size is the total #threads in the machine 2436 // minus 1 thread for every core that has 3 or more threads. 2437 // 2438 teamSize += ( threadIdCt <= 2 ) ? 
( threadIdCt ) : ( threadIdCt - 1 ); 2439 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2440 2441 for (index = threadIdIndex; index <= maxIndex; index++) { 2442 if (counts[index] > maxCt[index]) { 2443 maxCt[index] = counts[index]; 2444 } 2445 } 2446 2447 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2448 nCoresPerPkg = maxCt[coreIdIndex]; 2449 nPackages = totals[pkgIdIndex]; 2450 2451 // 2452 // Check to see if the machine topology is uniform 2453 // 2454 unsigned prod = totals[maxIndex]; 2455 for (index = threadIdIndex; index < maxIndex; index++) { 2456 prod *= maxCt[index]; 2457 } 2458 bool uniform = (prod == totals[threadIdIndex]); 2459 2460 // 2461 // When affinity is off, this routine will still be called to set 2462 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2463 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2464 // correctly, and return now if affinity is not enabled. 2465 // 2466 __kmp_ncores = totals[coreIdIndex]; 2467 2468 if (__kmp_affinity_verbose) { 2469 if (! KMP_AFFINITY_CAPABLE()) { 2470 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2471 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2472 if (uniform) { 2473 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2474 } else { 2475 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2476 } 2477 } 2478 else { 2479 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2480 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, __kmp_affin_fullMask); 2481 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2482 if (__kmp_affinity_respect_mask) { 2483 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2484 } else { 2485 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2486 } 2487 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2488 if (uniform) { 2489 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2490 } else { 2491 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2492 } 2493 } 2494 kmp_str_buf_t buf; 2495 __kmp_str_buf_init(&buf); 2496 2497 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2498 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2499 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2500 } 2501 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2502 maxCt[threadIdIndex], __kmp_ncores); 2503 2504 __kmp_str_buf_free(&buf); 2505 } 2506 2507 # if KMP_MIC && REDUCE_TEAM_SIZE 2508 // 2509 // Set the default team size. 2510 // 2511 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2512 __kmp_dflt_team_nth = teamSize; 2513 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2514 __kmp_dflt_team_nth)); 2515 } 2516 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2517 2518 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL); 2519 KMP_DEBUG_ASSERT(num_avail == __kmp_avail_proc); 2520 __kmp_pu_os_idx = (int*)__kmp_allocate(sizeof(int) * __kmp_avail_proc); 2521 for (i = 0; i < num_avail; ++i) { // fill the os indices 2522 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex]; 2523 } 2524 2525 if (__kmp_affinity_type == affinity_none) { 2526 __kmp_free(lastId); 2527 __kmp_free(totals); 2528 __kmp_free(maxCt); 2529 __kmp_free(counts); 2530 CLEANUP_THREAD_INFO; 2531 return 0; 2532 } 2533 2534 // 2535 // Count the number of levels which have more nodes at that level than 2536 // at the parent's level (with there being an implicit root node of 2537 // the top level). This is equivalent to saying that there is at least 2538 // one node at this level which has a sibling. These levels are in the 2539 // map, and the package level is always in the map. 
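    // E.g. (hypothetically) for a single-package machine with four cores and
    // one HW thread per core: totals[threadIdIndex] == totals[coreIdIndex]
    // == 4 and totals[pkgIdIndex] == 1, so inMap drops the thread level but
    // keeps the core and package levels, and the resulting depth is 2.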
2540 // 2541 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2542 int level = 0; 2543 for (index = threadIdIndex; index < maxIndex; index++) { 2544 KMP_ASSERT(totals[index] >= totals[index + 1]); 2545 inMap[index] = (totals[index] > totals[index + 1]); 2546 } 2547 inMap[maxIndex] = (totals[maxIndex] > 1); 2548 inMap[pkgIdIndex] = true; 2549 2550 int depth = 0; 2551 for (index = threadIdIndex; index <= maxIndex; index++) { 2552 if (inMap[index]) { 2553 depth++; 2554 } 2555 } 2556 KMP_ASSERT(depth > 0); 2557 2558 // 2559 // Construct the data structure that is to be returned. 2560 // 2561 *address2os = (AddrUnsPair*) 2562 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2563 int pkgLevel = -1; 2564 int coreLevel = -1; 2565 int threadLevel = -1; 2566 2567 for (i = 0; i < num_avail; ++i) { 2568 Address addr(depth); 2569 unsigned os = threadInfo[i][osIdIndex]; 2570 int src_index; 2571 int dst_index = 0; 2572 2573 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2574 if (! inMap[src_index]) { 2575 continue; 2576 } 2577 addr.labels[dst_index] = threadInfo[i][src_index]; 2578 if (src_index == pkgIdIndex) { 2579 pkgLevel = dst_index; 2580 } 2581 else if (src_index == coreIdIndex) { 2582 coreLevel = dst_index; 2583 } 2584 else if (src_index == threadIdIndex) { 2585 threadLevel = dst_index; 2586 } 2587 dst_index++; 2588 } 2589 (*address2os)[i] = AddrUnsPair(addr, os); 2590 } 2591 2592 if (__kmp_affinity_gran_levels < 0) { 2593 // 2594 // Set the granularity level based on what levels are modeled 2595 // in the machine topology map. 2596 // 2597 unsigned src_index; 2598 __kmp_affinity_gran_levels = 0; 2599 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2600 if (! inMap[src_index]) { 2601 continue; 2602 } 2603 switch (src_index) { 2604 case threadIdIndex: 2605 if (__kmp_affinity_gran > affinity_gran_thread) { 2606 __kmp_affinity_gran_levels++; 2607 } 2608 2609 break; 2610 case coreIdIndex: 2611 if (__kmp_affinity_gran > affinity_gran_core) { 2612 __kmp_affinity_gran_levels++; 2613 } 2614 break; 2615 2616 case pkgIdIndex: 2617 if (__kmp_affinity_gran > affinity_gran_package) { 2618 __kmp_affinity_gran_levels++; 2619 } 2620 break; 2621 } 2622 } 2623 } 2624 2625 if (__kmp_affinity_verbose) { 2626 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2627 coreLevel, threadLevel); 2628 } 2629 2630 __kmp_free(inMap); 2631 __kmp_free(lastId); 2632 __kmp_free(totals); 2633 __kmp_free(maxCt); 2634 __kmp_free(counts); 2635 CLEANUP_THREAD_INFO; 2636 return depth; 2637 } 2638 2639 2640 // 2641 // Create and return a table of affinity masks, indexed by OS thread ID. 2642 // This routine handles OR'ing together all the affinity masks of threads 2643 // that are sufficiently close, if granularity > fine. 2644 // 2645 static kmp_affin_mask_t * 2646 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2647 AddrUnsPair *address2os, unsigned numAddrs) 2648 { 2649 // 2650 // First form a table of affinity masks in order of OS thread id. 2651 // 2652 unsigned depth; 2653 unsigned maxOsId; 2654 unsigned i; 2655 2656 KMP_ASSERT(numAddrs > 0); 2657 depth = address2os[0].first.depth; 2658 2659 maxOsId = 0; 2660 for (i = 0; i < numAddrs; i++) { 2661 unsigned osId = address2os[i].second; 2662 if (osId > maxOsId) { 2663 maxOsId = osId; 2664 } 2665 } 2666 kmp_affin_mask_t *osId2Mask; 2667 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1)); 2668 2669 // 2670 // Sort the address2os table according to physical order. 
Doing so 2671 // will put all threads on the same core/package/node in consecutive 2672 // locations. 2673 // 2674 qsort(address2os, numAddrs, sizeof(*address2os), 2675 __kmp_affinity_cmp_Address_labels); 2676 2677 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2678 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2679 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2680 } 2681 if (__kmp_affinity_gran_levels >= (int)depth) { 2682 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2683 && (__kmp_affinity_type != affinity_none))) { 2684 KMP_WARNING(AffThreadsMayMigrate); 2685 } 2686 } 2687 2688 // 2689 // Run through the table, forming the masks for all threads on each 2690 // core. Threads on the same core will have identical "Address" 2691 // objects, not considering the last level, which must be the thread 2692 // id. All threads on a core will appear consecutively. 2693 // 2694 unsigned unique = 0; 2695 unsigned j = 0; // index of 1st thread on core 2696 unsigned leader = 0; 2697 Address *leaderAddr = &(address2os[0].first); 2698 kmp_affin_mask_t *sum; 2699 KMP_CPU_ALLOC_ON_STACK(sum); 2700 KMP_CPU_ZERO(sum); 2701 KMP_CPU_SET(address2os[0].second, sum); 2702 for (i = 1; i < numAddrs; i++) { 2703 // 2704 // If this thread is sufficiently close to the leader (within the 2705 // granularity setting), then set the bit for this os thread in the 2706 // affinity mask for this group, and go on to the next thread. 2707 // 2708 if (leaderAddr->isClose(address2os[i].first, 2709 __kmp_affinity_gran_levels)) { 2710 KMP_CPU_SET(address2os[i].second, sum); 2711 continue; 2712 } 2713 2714 // 2715 // For every thread in this group, copy the mask to the thread's 2716 // entry in the osId2Mask table. Mark the first address as a 2717 // leader. 2718 // 2719 for (; j < i; j++) { 2720 unsigned osId = address2os[j].second; 2721 KMP_DEBUG_ASSERT(osId <= maxOsId); 2722 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2723 KMP_CPU_COPY(mask, sum); 2724 address2os[j].first.leader = (j == leader); 2725 } 2726 unique++; 2727 2728 // 2729 // Start a new mask. 2730 // 2731 leader = i; 2732 leaderAddr = &(address2os[i].first); 2733 KMP_CPU_ZERO(sum); 2734 KMP_CPU_SET(address2os[i].second, sum); 2735 } 2736 2737 // 2738 // For every thread in last group, copy the mask to the thread's 2739 // entry in the osId2Mask table. 2740 // 2741 for (; j < i; j++) { 2742 unsigned osId = address2os[j].second; 2743 KMP_DEBUG_ASSERT(osId <= maxOsId); 2744 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2745 KMP_CPU_COPY(mask, sum); 2746 address2os[j].first.leader = (j == leader); 2747 } 2748 unique++; 2749 KMP_CPU_FREE_FROM_STACK(sum); 2750 2751 *maxIndex = maxOsId; 2752 *numUnique = unique; 2753 return osId2Mask; 2754 } 2755 2756 2757 // 2758 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2759 // as file-static than to try and pass them through the calling sequence of 2760 // the recursive-descent OMP_PLACES parser. 
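    // newMasks is the growing vector of result masks.  ADD_MASK appends one
    // entry, doubling the allocation when it runs out of room;
    // ADD_MASK_OSID first checks the OS proc id against maxOsId and its
    // entry in the osId2Mask table, warning about and skipping invalid ids.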
2761 // 2762 static kmp_affin_mask_t *newMasks; 2763 static int numNewMasks; 2764 static int nextNewMask; 2765 2766 #define ADD_MASK(_mask) \ 2767 { \ 2768 if (nextNewMask >= numNewMasks) { \ 2769 int i; \ 2770 numNewMasks *= 2; \ 2771 kmp_affin_mask_t* temp; \ 2772 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2773 for(i=0;i<numNewMasks/2;i++) { \ 2774 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \ 2775 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \ 2776 KMP_CPU_COPY(dest, src); \ 2777 } \ 2778 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \ 2779 newMasks = temp; \ 2780 } \ 2781 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2782 nextNewMask++; \ 2783 } 2784 2785 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2786 { \ 2787 if (((_osId) > _maxOsId) || \ 2788 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2789 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2790 && (__kmp_affinity_type != affinity_none))) { \ 2791 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2792 } \ 2793 } \ 2794 else { \ 2795 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2796 } \ 2797 } 2798 2799 2800 // 2801 // Re-parse the proclist (for the explicit affinity type), and form the list 2802 // of affinity newMasks indexed by gtid. 2803 // 2804 static void 2805 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2806 unsigned int *out_numMasks, const char *proclist, 2807 kmp_affin_mask_t *osId2Mask, int maxOsId) 2808 { 2809 int i; 2810 const char *scan = proclist; 2811 const char *next = proclist; 2812 2813 // 2814 // We use malloc() for the temporary mask vector, 2815 // so that we can use realloc() to extend it. 2816 // 2817 numNewMasks = 2; 2818 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2819 nextNewMask = 0; 2820 kmp_affin_mask_t *sumMask; 2821 KMP_CPU_ALLOC(sumMask); 2822 int setSize = 0; 2823 2824 for (;;) { 2825 int start, end, stride; 2826 2827 SKIP_WS(scan); 2828 next = scan; 2829 if (*next == '\0') { 2830 break; 2831 } 2832 2833 if (*next == '{') { 2834 int num; 2835 setSize = 0; 2836 next++; // skip '{' 2837 SKIP_WS(next); 2838 scan = next; 2839 2840 // 2841 // Read the first integer in the set. 2842 // 2843 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2844 "bad proclist"); 2845 SKIP_DIGITS(next); 2846 num = __kmp_str_to_int(scan, *next); 2847 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2848 2849 // 2850 // Copy the mask for that osId to the sum (union) mask. 2851 // 2852 if ((num > maxOsId) || 2853 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2854 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2855 && (__kmp_affinity_type != affinity_none))) { 2856 KMP_WARNING(AffIgnoreInvalidProcID, num); 2857 } 2858 KMP_CPU_ZERO(sumMask); 2859 } 2860 else { 2861 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2862 setSize = 1; 2863 } 2864 2865 for (;;) { 2866 // 2867 // Check for end of set. 2868 // 2869 SKIP_WS(next); 2870 if (*next == '}') { 2871 next++; // skip '}' 2872 break; 2873 } 2874 2875 // 2876 // Skip optional comma. 2877 // 2878 if (*next == ',') { 2879 next++; 2880 } 2881 SKIP_WS(next); 2882 2883 // 2884 // Read the next integer in the set. 2885 // 2886 scan = next; 2887 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2888 "bad explicit proc list"); 2889 2890 SKIP_DIGITS(next); 2891 num = __kmp_str_to_int(scan, *next); 2892 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2893 2894 // 2895 // Add the mask for that osId to the sum mask. 2896 // 2897 if ((num > maxOsId) || 2898 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2899 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2900 && (__kmp_affinity_type != affinity_none))) { 2901 KMP_WARNING(AffIgnoreInvalidProcID, num); 2902 } 2903 } 2904 else { 2905 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2906 setSize++; 2907 } 2908 } 2909 if (setSize > 0) { 2910 ADD_MASK(sumMask); 2911 } 2912 2913 SKIP_WS(next); 2914 if (*next == ',') { 2915 next++; 2916 } 2917 scan = next; 2918 continue; 2919 } 2920 2921 // 2922 // Read the first integer. 2923 // 2924 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2925 SKIP_DIGITS(next); 2926 start = __kmp_str_to_int(scan, *next); 2927 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2928 SKIP_WS(next); 2929 2930 // 2931 // If this isn't a range, then add a mask to the list and go on. 2932 // 2933 if (*next != '-') { 2934 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2935 2936 // 2937 // Skip optional comma. 2938 // 2939 if (*next == ',') { 2940 next++; 2941 } 2942 scan = next; 2943 continue; 2944 } 2945 2946 // 2947 // This is a range. Skip over the '-' and read in the 2nd int. 2948 // 2949 next++; // skip '-' 2950 SKIP_WS(next); 2951 scan = next; 2952 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2953 SKIP_DIGITS(next); 2954 end = __kmp_str_to_int(scan, *next); 2955 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2956 2957 // 2958 // Check for a stride parameter 2959 // 2960 stride = 1; 2961 SKIP_WS(next); 2962 if (*next == ':') { 2963 // 2964 // A stride is specified. Skip over the ':" and read the 3rd int. 2965 // 2966 int sign = +1; 2967 next++; // skip ':' 2968 SKIP_WS(next); 2969 scan = next; 2970 if (*next == '-') { 2971 sign = -1; 2972 next++; 2973 SKIP_WS(next); 2974 scan = next; 2975 } 2976 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2977 "bad explicit proc list"); 2978 SKIP_DIGITS(next); 2979 stride = __kmp_str_to_int(scan, *next); 2980 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2981 stride *= sign; 2982 } 2983 2984 // 2985 // Do some range checks. 2986 // 2987 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2988 if (stride > 0) { 2989 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2990 } 2991 else { 2992 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2993 } 2994 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2995 2996 // 2997 // Add the mask for each OS proc # to the list. 2998 // 2999 if (stride > 0) { 3000 do { 3001 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3002 start += stride; 3003 } while (start <= end); 3004 } 3005 else { 3006 do { 3007 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3008 start += stride; 3009 } while (start >= end); 3010 } 3011 3012 // 3013 // Skip optional comma. 
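    // Putting the cases above together, an explicit proclist such as
    //     3,0-2,8-14:2,{0,1,2}
    // (purely illustrative) yields one mask for proc 3, one apiece for procs
    // 0, 1 and 2, one apiece for 8, 10, 12 and 14, and finally a single mask
    // covering the whole set {0,1,2}.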
3014 // 3015 SKIP_WS(next); 3016 if (*next == ',') { 3017 next++; 3018 } 3019 scan = next; 3020 } 3021 3022 *out_numMasks = nextNewMask; 3023 if (nextNewMask == 0) { 3024 *out_masks = NULL; 3025 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3026 return; 3027 } 3028 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3029 for(i = 0; i < nextNewMask; i++) { 3030 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3031 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3032 KMP_CPU_COPY(dest, src); 3033 } 3034 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3035 KMP_CPU_FREE(sumMask); 3036 } 3037 3038 3039 # if OMP_40_ENABLED 3040 3041 /*----------------------------------------------------------------------------- 3042 3043 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3044 places. Again, Here is the grammar: 3045 3046 place_list := place 3047 place_list := place , place_list 3048 place := num 3049 place := place : num 3050 place := place : num : signed 3051 place := { subplacelist } 3052 place := ! place // (lowest priority) 3053 subplace_list := subplace 3054 subplace_list := subplace , subplace_list 3055 subplace := num 3056 subplace := num : num 3057 subplace := num : num : signed 3058 signed := num 3059 signed := + signed 3060 signed := - signed 3061 3062 -----------------------------------------------------------------------------*/ 3063 3064 static void 3065 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 3066 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3067 { 3068 const char *next; 3069 3070 for (;;) { 3071 int start, count, stride, i; 3072 3073 // 3074 // Read in the starting proc id 3075 // 3076 SKIP_WS(*scan); 3077 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3078 "bad explicit places list"); 3079 next = *scan; 3080 SKIP_DIGITS(next); 3081 start = __kmp_str_to_int(*scan, *next); 3082 KMP_ASSERT(start >= 0); 3083 *scan = next; 3084 3085 // 3086 // valid follow sets are ',' ':' and '}' 3087 // 3088 SKIP_WS(*scan); 3089 if (**scan == '}' || **scan == ',') { 3090 if ((start > maxOsId) || 3091 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3092 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3093 && (__kmp_affinity_type != affinity_none))) { 3094 KMP_WARNING(AffIgnoreInvalidProcID, start); 3095 } 3096 } 3097 else { 3098 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3099 (*setSize)++; 3100 } 3101 if (**scan == '}') { 3102 break; 3103 } 3104 (*scan)++; // skip ',' 3105 continue; 3106 } 3107 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3108 (*scan)++; // skip ':' 3109 3110 // 3111 // Read count parameter 3112 // 3113 SKIP_WS(*scan); 3114 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3115 "bad explicit places list"); 3116 next = *scan; 3117 SKIP_DIGITS(next); 3118 count = __kmp_str_to_int(*scan, *next); 3119 KMP_ASSERT(count >= 0); 3120 *scan = next; 3121 3122 // 3123 // valid follow sets are ',' ':' and '}' 3124 // 3125 SKIP_WS(*scan); 3126 if (**scan == '}' || **scan == ',') { 3127 for (i = 0; i < count; i++) { 3128 if ((start > maxOsId) || 3129 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3130 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3131 && (__kmp_affinity_type != affinity_none))) { 3132 KMP_WARNING(AffIgnoreInvalidProcID, start); 3133 } 3134 break; // don't proliferate warnings for large count 3135 } 3136 else { 3137 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3138 start++; 3139 (*setSize)++; 3140 } 3141 } 3142 if (**scan == '}') { 3143 break; 3144 } 3145 (*scan)++; // skip ',' 3146 continue; 3147 } 3148 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3149 (*scan)++; // skip ':' 3150 3151 // 3152 // Read stride parameter 3153 // 3154 int sign = +1; 3155 for (;;) { 3156 SKIP_WS(*scan); 3157 if (**scan == '+') { 3158 (*scan)++; // skip '+' 3159 continue; 3160 } 3161 if (**scan == '-') { 3162 sign *= -1; 3163 (*scan)++; // skip '-' 3164 continue; 3165 } 3166 break; 3167 } 3168 SKIP_WS(*scan); 3169 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3170 "bad explicit places list"); 3171 next = *scan; 3172 SKIP_DIGITS(next); 3173 stride = __kmp_str_to_int(*scan, *next); 3174 KMP_ASSERT(stride >= 0); 3175 *scan = next; 3176 stride *= sign; 3177 3178 // 3179 // valid follow sets are ',' and '}' 3180 // 3181 SKIP_WS(*scan); 3182 if (**scan == '}' || **scan == ',') { 3183 for (i = 0; i < count; i++) { 3184 if ((start > maxOsId) || 3185 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3186 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3187 && (__kmp_affinity_type != affinity_none))) { 3188 KMP_WARNING(AffIgnoreInvalidProcID, start); 3189 } 3190 break; // don't proliferate warnings for large count 3191 } 3192 else { 3193 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3194 start += stride; 3195 (*setSize)++; 3196 } 3197 } 3198 if (**scan == '}') { 3199 break; 3200 } 3201 (*scan)++; // skip ',' 3202 continue; 3203 } 3204 3205 KMP_ASSERT2(0, "bad explicit places list"); 3206 } 3207 } 3208 3209 3210 static void 3211 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3212 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3213 { 3214 const char *next; 3215 3216 // 3217 // valid follow sets are '{' '!' and num 3218 // 3219 SKIP_WS(*scan); 3220 if (**scan == '{') { 3221 (*scan)++; // skip '{' 3222 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3223 setSize); 3224 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3225 (*scan)++; // skip '}' 3226 } 3227 else if (**scan == '!') { 3228 (*scan)++; // skip '!' 3229 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3230 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3231 } 3232 else if ((**scan >= '0') && (**scan <= '9')) { 3233 next = *scan; 3234 SKIP_DIGITS(next); 3235 int num = __kmp_str_to_int(*scan, *next); 3236 KMP_ASSERT(num >= 0); 3237 if ((num > maxOsId) || 3238 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3239 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3240 && (__kmp_affinity_type != affinity_none))) { 3241 KMP_WARNING(AffIgnoreInvalidProcID, num); 3242 } 3243 } 3244 else { 3245 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3246 (*setSize)++; 3247 } 3248 *scan = next; // skip num 3249 } 3250 else { 3251 KMP_ASSERT2(0, "bad explicit places list"); 3252 } 3253 } 3254 3255 3256 //static void 3257 void 3258 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3259 unsigned int *out_numMasks, const char *placelist, 3260 kmp_affin_mask_t *osId2Mask, int maxOsId) 3261 { 3262 int i,j,count,stride,sign; 3263 const char *scan = placelist; 3264 const char *next = placelist; 3265 3266 numNewMasks = 2; 3267 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3268 nextNewMask = 0; 3269 3270 // tempMask is modified based on the previous or initial 3271 // place to form the current place 3272 // previousMask contains the previous place 3273 kmp_affin_mask_t *tempMask; 3274 kmp_affin_mask_t *previousMask; 3275 KMP_CPU_ALLOC(tempMask); 3276 KMP_CPU_ZERO(tempMask); 3277 KMP_CPU_ALLOC(previousMask); 3278 KMP_CPU_ZERO(previousMask); 3279 int setSize = 0; 3280 3281 for (;;) { 3282 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3283 3284 // 3285 // valid follow sets are ',' ':' and EOL 3286 // 3287 SKIP_WS(scan); 3288 if (*scan == '\0' || *scan == ',') { 3289 if (setSize > 0) { 3290 ADD_MASK(tempMask); 3291 } 3292 KMP_CPU_ZERO(tempMask); 3293 setSize = 0; 3294 if (*scan == '\0') { 3295 break; 3296 } 3297 scan++; // skip ',' 3298 continue; 3299 } 3300 3301 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3302 scan++; // skip ':' 3303 3304 // 3305 // Read count parameter 3306 // 3307 SKIP_WS(scan); 3308 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3309 "bad explicit places list"); 3310 next = scan; 3311 SKIP_DIGITS(next); 3312 count = __kmp_str_to_int(scan, *next); 3313 KMP_ASSERT(count >= 0); 3314 scan = next; 3315 3316 // 3317 // valid follow sets are ',' ':' and EOL 3318 // 3319 SKIP_WS(scan); 3320 if (*scan == '\0' || *scan == ',') { 3321 stride = +1; 3322 } 3323 else { 3324 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3325 scan++; // skip ':' 3326 3327 // 3328 // Read stride parameter 3329 // 3330 sign = +1; 3331 for (;;) { 3332 SKIP_WS(scan); 3333 if (*scan == '+') { 3334 scan++; // skip '+' 3335 continue; 3336 } 3337 if (*scan == '-') { 3338 sign *= -1; 3339 scan++; // skip '-' 3340 continue; 3341 } 3342 break; 3343 } 3344 SKIP_WS(scan); 3345 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3346 "bad explicit places list"); 3347 next = scan; 3348 SKIP_DIGITS(next); 3349 stride = __kmp_str_to_int(scan, *next); 3350 KMP_DEBUG_ASSERT(stride >= 0); 3351 scan = next; 3352 stride *= sign; 3353 } 3354 3355 // Add places determined by initial_place : count : stride 3356 for (i = 0; i < count; i++) { 3357 if (setSize == 0) { 3358 break; 3359 } 3360 // Add the current place, then build the next place (tempMask) from that 3361 KMP_CPU_COPY(previousMask, tempMask); 3362 ADD_MASK(previousMask); 3363 KMP_CPU_ZERO(tempMask); 3364 setSize = 0; 3365 KMP_CPU_SET_ITERATE(j, previousMask) { 3366 if (! KMP_CPU_ISSET(j, previousMask)) { 3367 continue; 3368 } 3369 if ((j+stride > maxOsId) || (j+stride < 0) || 3370 (! KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3371 (! 
KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) { 3372 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3373 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3374 KMP_WARNING(AffIgnoreInvalidProcID, j+stride); 3375 } 3376 continue; 3377 } 3378 KMP_CPU_SET(j+stride, tempMask); 3379 setSize++; 3380 } 3381 } 3382 KMP_CPU_ZERO(tempMask); 3383 setSize = 0; 3384 3385 // 3386 // valid follow sets are ',' and EOL 3387 // 3388 SKIP_WS(scan); 3389 if (*scan == '\0') { 3390 break; 3391 } 3392 if (*scan == ',') { 3393 scan++; // skip ',' 3394 continue; 3395 } 3396 3397 KMP_ASSERT2(0, "bad explicit places list"); 3398 } 3399 3400 *out_numMasks = nextNewMask; 3401 if (nextNewMask == 0) { 3402 *out_masks = NULL; 3403 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3404 return; 3405 } 3406 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3407 KMP_CPU_FREE(tempMask); 3408 KMP_CPU_FREE(previousMask); 3409 for(i = 0; i < nextNewMask; i++) { 3410 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3411 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3412 KMP_CPU_COPY(dest, src); 3413 } 3414 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3415 } 3416 3417 # endif /* OMP_40_ENABLED */ 3418 3419 #undef ADD_MASK 3420 #undef ADD_MASK_OSID 3421 3422 static void 3423 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3424 { 3425 int i, j, k, n_old = 0, n_new = 0, proc_num = 0; 3426 if (__kmp_place_num_sockets == 0 && 3427 __kmp_place_num_cores == 0 && 3428 __kmp_place_num_threads_per_core == 0 ) 3429 goto _exit; // no topology limiting actions requested, exit 3430 if (__kmp_place_num_sockets == 0) 3431 __kmp_place_num_sockets = nPackages; // use all available sockets 3432 if (__kmp_place_num_cores == 0) 3433 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3434 if (__kmp_place_num_threads_per_core == 0 || 3435 __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore) 3436 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3437 3438 if ( !__kmp_affinity_uniform_topology() ) { 3439 KMP_WARNING( AffHWSubsetNonUniform ); 3440 goto _exit; // don't support non-uniform topology 3441 } 3442 if ( depth > 3 ) { 3443 KMP_WARNING( AffHWSubsetNonThreeLevel ); 3444 goto _exit; // don't support not-3-level topology 3445 } 3446 if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) { 3447 KMP_WARNING(AffHWSubsetManySockets); 3448 goto _exit; 3449 } 3450 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3451 KMP_WARNING( AffHWSubsetManyCores ); 3452 goto _exit; 3453 } 3454 3455 AddrUnsPair *newAddr; 3456 if (pAddr) // pAddr is NULL in case of affinity_none 3457 newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3458 __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3459 3460 for (i = 0; i < nPackages; ++i) { 3461 if (i < __kmp_place_socket_offset || 3462 i >= __kmp_place_socket_offset + __kmp_place_num_sockets) { 3463 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket 3464 if (__kmp_pu_os_idx != NULL) { 3465 for (j = 0; j < nCoresPerPkg; ++j) { // walk through skipped socket 3466 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { 3467 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3468 ++proc_num; 3469 } 3470 } 3471 } 3472 } else { 3473 for (j = 0; j < nCoresPerPkg; ++j) { // walk through requested socket 3474 if (j < __kmp_place_core_offset || 3475 j >= __kmp_place_core_offset + __kmp_place_num_cores) { 3476 
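                // This core lies outside the requested window
                // [__kmp_place_core_offset, __kmp_place_core_offset +
                // __kmp_place_num_cores) of an otherwise requested socket:
                // advance n_old past its HW threads and drop them from the
                // full mask.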
n_old += __kmp_nThreadsPerCore; // skip not-requested core 3477 if (__kmp_pu_os_idx != NULL) { 3478 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through skipped core 3479 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3480 ++proc_num; 3481 } 3482 } 3483 } else { 3484 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core 3485 if (k < __kmp_place_num_threads_per_core) { 3486 if (pAddr) 3487 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data 3488 n_new++; 3489 } else { 3490 if (__kmp_pu_os_idx != NULL) 3491 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask); 3492 } 3493 n_old++; 3494 ++proc_num; 3495 } 3496 } 3497 } 3498 } 3499 } 3500 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore); 3501 KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores * 3502 __kmp_place_num_threads_per_core); 3503 3504 nPackages = __kmp_place_num_sockets; // correct nPackages 3505 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3506 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3507 __kmp_avail_proc = n_new; // correct avail_proc 3508 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3509 3510 if (pAddr) { 3511 __kmp_free( *pAddr ); 3512 *pAddr = newAddr; // replace old topology with new one 3513 } 3514 _exit: 3515 if (__kmp_pu_os_idx != NULL) { 3516 __kmp_free(__kmp_pu_os_idx); 3517 __kmp_pu_os_idx = NULL; 3518 } 3519 } 3520 3521 3522 static AddrUnsPair *address2os = NULL; 3523 static int * procarr = NULL; 3524 static int __kmp_aff_depth = 0; 3525 3526 #define KMP_EXIT_AFF_NONE \ 3527 KMP_ASSERT(__kmp_affinity_type == affinity_none); \ 3528 KMP_ASSERT(address2os == NULL); \ 3529 __kmp_apply_thread_places(NULL, 0); \ 3530 return; 3531 3532 static void 3533 __kmp_aux_affinity_initialize(void) 3534 { 3535 if (__kmp_affinity_masks != NULL) { 3536 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3537 return; 3538 } 3539 3540 // 3541 // Create the "full" mask - this defines all of the processors that we 3542 // consider to be in the machine model. If respect is set, then it is 3543 // the initialization thread's affinity mask. Otherwise, it is all 3544 // processors that we know about on the machine. 3545 // 3546 if (__kmp_affin_fullMask == NULL) { 3547 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3548 } 3549 if (KMP_AFFINITY_CAPABLE()) { 3550 if (__kmp_affinity_respect_mask) { 3551 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3552 3553 // 3554 // Count the number of available processors. 3555 // 3556 unsigned i; 3557 __kmp_avail_proc = 0; 3558 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3559 if (! 
KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3560 continue; 3561 } 3562 __kmp_avail_proc++; 3563 } 3564 if (__kmp_avail_proc > __kmp_xproc) { 3565 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3566 && (__kmp_affinity_type != affinity_none))) { 3567 KMP_WARNING(ErrorInitializeAffinity); 3568 } 3569 __kmp_affinity_type = affinity_none; 3570 KMP_AFFINITY_DISABLE(); 3571 return; 3572 } 3573 } 3574 else { 3575 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3576 __kmp_avail_proc = __kmp_xproc; 3577 } 3578 } 3579 3580 int depth = -1; 3581 kmp_i18n_id_t msg_id = kmp_i18n_null; 3582 3583 // 3584 // For backward compatibility, setting KMP_CPUINFO_FILE => 3585 // KMP_TOPOLOGY_METHOD=cpuinfo 3586 // 3587 if ((__kmp_cpuinfo_file != NULL) && 3588 (__kmp_affinity_top_method == affinity_top_method_all)) { 3589 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3590 } 3591 3592 if (__kmp_affinity_top_method == affinity_top_method_all) { 3593 // 3594 // In the default code path, errors are not fatal - we just try using 3595 // another method. We only emit a warning message if affinity is on, 3596 // or the verbose flag is set, an the nowarnings flag was not set. 3597 // 3598 const char *file_name = NULL; 3599 int line = 0; 3600 # if KMP_USE_HWLOC 3601 if (depth < 0) { 3602 if (__kmp_affinity_verbose) { 3603 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 3604 } 3605 if(!__kmp_hwloc_error) { 3606 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 3607 if (depth == 0) { 3608 KMP_EXIT_AFF_NONE; 3609 } else if(depth < 0 && __kmp_affinity_verbose) { 3610 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3611 } 3612 } else if(__kmp_affinity_verbose) { 3613 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3614 } 3615 } 3616 # endif 3617 3618 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3619 3620 if (depth < 0) { 3621 if (__kmp_affinity_verbose) { 3622 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3623 } 3624 3625 file_name = NULL; 3626 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3627 if (depth == 0) { 3628 KMP_EXIT_AFF_NONE; 3629 } 3630 3631 if (depth < 0) { 3632 if (__kmp_affinity_verbose) { 3633 if (msg_id != kmp_i18n_null) { 3634 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3635 KMP_I18N_STR(DecodingLegacyAPIC)); 3636 } 3637 else { 3638 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3639 } 3640 } 3641 3642 file_name = NULL; 3643 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3644 if (depth == 0) { 3645 KMP_EXIT_AFF_NONE; 3646 } 3647 } 3648 } 3649 3650 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3651 3652 # if KMP_OS_LINUX 3653 3654 if (depth < 0) { 3655 if (__kmp_affinity_verbose) { 3656 if (msg_id != kmp_i18n_null) { 3657 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3658 } 3659 else { 3660 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3661 } 3662 } 3663 3664 FILE *f = fopen("/proc/cpuinfo", "r"); 3665 if (f == NULL) { 3666 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3667 } 3668 else { 3669 file_name = "/proc/cpuinfo"; 3670 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3671 fclose(f); 3672 if (depth == 0) { 3673 KMP_EXIT_AFF_NONE; 3674 } 3675 } 3676 } 3677 3678 # endif /* KMP_OS_LINUX */ 3679 3680 # if KMP_GROUP_AFFINITY 3681 3682 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3683 if (__kmp_affinity_verbose) { 3684 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3685 } 3686 3687 depth 
                = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
            KMP_ASSERT(depth != 0);
        }

# endif /* KMP_GROUP_AFFINITY */

        if (depth < 0) {
            if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
                if (file_name == NULL) {
                    KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
                }
                else if (line == 0) {
                    KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
                }
                else {
                    KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id));
                }
            }
            // FIXME - print msg if msg_id = kmp_i18n_null ???

            file_name = "";
            depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
            if (depth == 0) {
                KMP_EXIT_AFF_NONE;
            }
            KMP_ASSERT(depth > 0);
            KMP_ASSERT(address2os != NULL);
        }
    }

    //
    // If the user has specified that a particular topology discovery method
    // is to be used, then we abort if that method fails. The exception is
    // group affinity, which might have been implicitly set.
    //

# if KMP_ARCH_X86 || KMP_ARCH_X86_64

    else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(Decodingx2APIC));
        }

        depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_EXIT_AFF_NONE;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }
    else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
              KMP_I18N_STR(DecodingLegacyAPIC));
        }

        depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
        if (depth == 0) {
            KMP_EXIT_AFF_NONE;
        }
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
        }
    }

# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

    else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
        const char *filename;
        if (__kmp_cpuinfo_file != NULL) {
            filename = __kmp_cpuinfo_file;
        }
        else {
            filename = "/proc/cpuinfo";
        }

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
        }

        FILE *f = fopen(filename, "r");
        if (f == NULL) {
            int code = errno;
            if (__kmp_cpuinfo_file != NULL) {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    KMP_HNT(NameComesFrom_CPUINFO_FILE),
                    __kmp_msg_null
                );
            }
            else {
                __kmp_msg(
                    kmp_ms_fatal,
                    KMP_MSG(CantOpenFileForReading, filename),
                    KMP_ERR(code),
                    __kmp_msg_null
                );
            }
        }
        int line = 0;
        depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
        fclose(f);
        if (depth < 0) {
            KMP_ASSERT(msg_id != kmp_i18n_null);
            if (line > 0) {
                KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id));
            }
            else {
                KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
            }
        }
        if (__kmp_affinity_type == affinity_none) {
            KMP_ASSERT(depth == 0);
            KMP_EXIT_AFF_NONE;
        }
    }

# if KMP_GROUP_AFFINITY

    else if (__kmp_affinity_top_method == affinity_top_method_group) {
        if (__kmp_affinity_verbose) {
KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3815 } 3816 3817 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3818 KMP_ASSERT(depth != 0); 3819 if (depth < 0) { 3820 KMP_ASSERT(msg_id != kmp_i18n_null); 3821 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3822 } 3823 } 3824 3825 # endif /* KMP_GROUP_AFFINITY */ 3826 3827 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3828 if (__kmp_affinity_verbose) { 3829 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3830 } 3831 3832 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3833 if (depth == 0) { 3834 KMP_EXIT_AFF_NONE; 3835 } 3836 // should not fail 3837 KMP_ASSERT(depth > 0); 3838 KMP_ASSERT(address2os != NULL); 3839 } 3840 3841 # if KMP_USE_HWLOC 3842 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3843 if (__kmp_affinity_verbose) { 3844 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 3845 } 3846 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 3847 if (depth == 0) { 3848 KMP_EXIT_AFF_NONE; 3849 } 3850 } 3851 # endif // KMP_USE_HWLOC 3852 3853 if (address2os == NULL) { 3854 if (KMP_AFFINITY_CAPABLE() 3855 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3856 && (__kmp_affinity_type != affinity_none)))) { 3857 KMP_WARNING(ErrorInitializeAffinity); 3858 } 3859 __kmp_affinity_type = affinity_none; 3860 KMP_AFFINITY_DISABLE(); 3861 return; 3862 } 3863 3864 __kmp_apply_thread_places(&address2os, depth); 3865 3866 // 3867 // Create the table of masks, indexed by thread Id. 3868 // 3869 unsigned maxIndex; 3870 unsigned numUnique; 3871 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3872 address2os, __kmp_avail_proc); 3873 if (__kmp_affinity_gran_levels == 0) { 3874 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3875 } 3876 3877 // 3878 // Set the childNums vector in all Address objects. This must be done 3879 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3880 // which takes into account the setting of __kmp_affinity_compact. 3881 // 3882 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3883 3884 switch (__kmp_affinity_type) { 3885 3886 case affinity_explicit: 3887 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3888 # if OMP_40_ENABLED 3889 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3890 # endif 3891 { 3892 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3893 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3894 maxIndex); 3895 } 3896 # if OMP_40_ENABLED 3897 else { 3898 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3899 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3900 maxIndex); 3901 } 3902 # endif 3903 if (__kmp_affinity_num_masks == 0) { 3904 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3905 && (__kmp_affinity_type != affinity_none))) { 3906 KMP_WARNING(AffNoValidProcID); 3907 } 3908 __kmp_affinity_type = affinity_none; 3909 return; 3910 } 3911 break; 3912 3913 // 3914 // The other affinity types rely on sorting the Addresses according 3915 // to some permutation of the machine topology tree. Set 3916 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3917 // then jump to a common code fragment to do the sort and create 3918 // the array of affinity masks. 
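    // As a rough illustration: on a depth-3 topology (package, core, thread),
    // a "compact" ordering keeps consecutive masks on neighboring thread
    // contexts of the same core and package, while "scatter" inverts the
    // permutation (compact = depth - 1 - compact below), so consecutive masks
    // are spread across packages first. E.g., with 2 packages x 2 cores,
    // compact might produce the order {p0c0, p0c1, p1c0, p1c1}, and scatter
    // {p0c0, p1c0, p0c1, p1c1}.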
3919 // 3920 3921 case affinity_logical: 3922 __kmp_affinity_compact = 0; 3923 if (__kmp_affinity_offset) { 3924 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3925 % __kmp_avail_proc; 3926 } 3927 goto sortAddresses; 3928 3929 case affinity_physical: 3930 if (__kmp_nThreadsPerCore > 1) { 3931 __kmp_affinity_compact = 1; 3932 if (__kmp_affinity_compact >= depth) { 3933 __kmp_affinity_compact = 0; 3934 } 3935 } else { 3936 __kmp_affinity_compact = 0; 3937 } 3938 if (__kmp_affinity_offset) { 3939 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3940 % __kmp_avail_proc; 3941 } 3942 goto sortAddresses; 3943 3944 case affinity_scatter: 3945 if (__kmp_affinity_compact >= depth) { 3946 __kmp_affinity_compact = 0; 3947 } 3948 else { 3949 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3950 } 3951 goto sortAddresses; 3952 3953 case affinity_compact: 3954 if (__kmp_affinity_compact >= depth) { 3955 __kmp_affinity_compact = depth - 1; 3956 } 3957 goto sortAddresses; 3958 3959 case affinity_balanced: 3960 // Balanced works only for the case of a single package 3961 if( nPackages > 1 ) { 3962 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3963 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3964 } 3965 __kmp_affinity_type = affinity_none; 3966 return; 3967 } else if( __kmp_affinity_uniform_topology() ) { 3968 break; 3969 } else { // Non-uniform topology 3970 3971 // Save the depth for further usage 3972 __kmp_aff_depth = depth; 3973 3974 // Number of hyper threads per core in HT machine 3975 int nth_per_core = __kmp_nThreadsPerCore; 3976 3977 int core_level; 3978 if( nth_per_core > 1 ) { 3979 core_level = depth - 2; 3980 } else { 3981 core_level = depth - 1; 3982 } 3983 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3984 int nproc = nth_per_core * ncores; 3985 3986 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3987 for( int i = 0; i < nproc; i++ ) { 3988 procarr[ i ] = -1; 3989 } 3990 3991 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3992 int proc = address2os[ i ].second; 3993 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3994 // If there is only one thread per core then depth == 2: level 0 - package, 3995 // level 1 - core. 3996 int level = depth - 1; 3997 3998 // __kmp_nth_per_core == 1 3999 int thread = 0; 4000 int core = address2os[ i ].first.labels[ level ]; 4001 // If the thread level exists, that is we have more than one thread context per core 4002 if( nth_per_core > 1 ) { 4003 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 4004 core = address2os[ i ].first.labels[ level - 1 ]; 4005 } 4006 procarr[ core * nth_per_core + thread ] = proc; 4007 } 4008 4009 break; 4010 } 4011 4012 sortAddresses: 4013 // 4014 // Allocate the gtid->affinity mask table. 4015 // 4016 if (__kmp_affinity_dups) { 4017 __kmp_affinity_num_masks = __kmp_avail_proc; 4018 } 4019 else { 4020 __kmp_affinity_num_masks = numUnique; 4021 } 4022 4023 # if OMP_40_ENABLED 4024 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 4025 && ( __kmp_affinity_num_places > 0 ) 4026 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 4027 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4028 } 4029 # endif 4030 4031 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4032 4033 // 4034 // Sort the address2os table according to the current setting of 4035 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
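        // Sketch of the fragment below: after the qsort, the entries are
        // walked in sorted order and each entry's osId mask is copied into
        // __kmp_affinity_masks. When duplicates are not requested
        // (__kmp_affinity_dups == 0), only the "leader" entry of each
        // granularity group contributes a mask, so numUnique places are
        // produced instead of __kmp_avail_proc.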
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

    default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}
#undef KMP_EXIT_AFF_NONE


void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none. We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code. Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
        __kmp_affinity_masks = NULL;
    }
    if (__kmp_affin_fullMask != NULL) {
        KMP_CPU_FREE(__kmp_affin_fullMask);
        __kmp_affin_fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if( address2os != NULL ) {
        __kmp_free( address2os );
        address2os = NULL;
    }
    if( procarr != NULL ) {
        __kmp_free( procarr );
        procarr = NULL;
    }
# if KMP_USE_HWLOC
    if (__kmp_hwloc_topology != NULL) {
        hwloc_topology_destroy(__kmp_hwloc_topology);
        __kmp_hwloc_topology = NULL;
    }
# endif
}


void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
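    // Otherwise the place index is computed below as
    //     i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
    // e.g. (for illustration) with 4 masks and an offset of 1, gtids 0..3
    // are bound to places 1, 2, 3, and 0 respectively.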
4155 // 4156 kmp_affin_mask_t *mask; 4157 int i; 4158 4159 # if OMP_40_ENABLED 4160 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 4161 # endif 4162 { 4163 if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced) 4164 ) { 4165 # if KMP_GROUP_AFFINITY 4166 if (__kmp_num_proc_groups > 1) { 4167 return; 4168 } 4169 # endif 4170 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4171 i = KMP_PLACE_ALL; 4172 mask = __kmp_affin_fullMask; 4173 } 4174 else { 4175 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4176 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4177 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4178 } 4179 } 4180 # if OMP_40_ENABLED 4181 else { 4182 if ((! isa_root) 4183 || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4184 # if KMP_GROUP_AFFINITY 4185 if (__kmp_num_proc_groups > 1) { 4186 return; 4187 } 4188 # endif 4189 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4190 i = KMP_PLACE_ALL; 4191 mask = __kmp_affin_fullMask; 4192 } 4193 else { 4194 // 4195 // int i = some hash function or just a counter that doesn't 4196 // always start at 0. Use gtid for now. 4197 // 4198 KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 ); 4199 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4200 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4201 } 4202 } 4203 # endif 4204 4205 # if OMP_40_ENABLED 4206 th->th.th_current_place = i; 4207 if (isa_root) { 4208 th->th.th_new_place = i; 4209 th->th.th_first_place = 0; 4210 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4211 } 4212 4213 if (i == KMP_PLACE_ALL) { 4214 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4215 gtid)); 4216 } 4217 else { 4218 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4219 gtid, i)); 4220 } 4221 # else 4222 if (i == -1) { 4223 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to __kmp_affin_fullMask\n", 4224 gtid)); 4225 } 4226 else { 4227 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n", 4228 gtid, i)); 4229 } 4230 # endif /* OMP_40_ENABLED */ 4231 4232 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4233 4234 if (__kmp_affinity_verbose) { 4235 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4236 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4237 th->th.th_affin_mask); 4238 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid, 4239 buf); 4240 } 4241 4242 # if KMP_OS_WINDOWS 4243 // 4244 // On Windows* OS, the process affinity mask might have changed. 4245 // If the user didn't request affinity and this call fails, 4246 // just continue silently. See CQ171393. 4247 // 4248 if ( __kmp_affinity_type == affinity_none ) { 4249 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4250 } 4251 else 4252 # endif 4253 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4254 } 4255 4256 4257 # if OMP_40_ENABLED 4258 4259 void 4260 __kmp_affinity_set_place(int gtid) 4261 { 4262 int retval; 4263 4264 if (! KMP_AFFINITY_CAPABLE()) { 4265 return; 4266 } 4267 4268 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4269 4270 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n", 4271 gtid, th->th.th_new_place, th->th.th_current_place)); 4272 4273 // 4274 // Check that the new place is within this thread's partition. 
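    // A place partition may wrap around the end of the place list, in which
    // case th_first_place > th_last_place; the else branch below is meant to
    // cover that wrapped case (for example, first_place 6 and last_place 1
    // on an 8-place machine would describe places {6, 7, 0, 1}).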
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */


int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
                if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                }
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (!
KMP_AFFINITY_CAPABLE()) { 4391 return -1; 4392 } 4393 4394 gtid = __kmp_entry_gtid(); 4395 th = __kmp_threads[gtid]; 4396 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4397 4398 KA_TRACE(1000, ;{ 4399 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4400 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4401 th->th.th_affin_mask); 4402 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf); 4403 }); 4404 4405 if (__kmp_env_consistency_check) { 4406 if ((mask == NULL) || (*mask == NULL)) { 4407 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4408 } 4409 } 4410 4411 # if !KMP_OS_WINDOWS 4412 4413 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4414 KA_TRACE(1000, ;{ 4415 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4416 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4417 (kmp_affin_mask_t *)(*mask)); 4418 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf); 4419 }); 4420 return retval; 4421 4422 # else 4423 4424 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4425 return 0; 4426 4427 # endif /* KMP_OS_WINDOWS */ 4428 4429 } 4430 4431 int 4432 __kmp_aux_set_affinity_mask_proc(int proc, void **mask) 4433 { 4434 int retval; 4435 4436 if (! KMP_AFFINITY_CAPABLE()) { 4437 return -1; 4438 } 4439 4440 KA_TRACE(1000, ;{ 4441 int gtid = __kmp_entry_gtid(); 4442 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4443 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4444 (kmp_affin_mask_t *)(*mask)); 4445 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n", 4446 proc, gtid, buf); 4447 }); 4448 4449 if (__kmp_env_consistency_check) { 4450 if ((mask == NULL) || (*mask == NULL)) { 4451 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4452 } 4453 } 4454 4455 if ((proc < 0) 4456 # if !KMP_USE_HWLOC 4457 || ((unsigned)proc >= KMP_CPU_SETSIZE) 4458 # endif 4459 ) { 4460 return -1; 4461 } 4462 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4463 return -2; 4464 } 4465 4466 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4467 return 0; 4468 } 4469 4470 4471 int 4472 __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) 4473 { 4474 int retval; 4475 4476 if (! KMP_AFFINITY_CAPABLE()) { 4477 return -1; 4478 } 4479 4480 KA_TRACE(1000, ;{ 4481 int gtid = __kmp_entry_gtid(); 4482 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4483 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4484 (kmp_affin_mask_t *)(*mask)); 4485 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n", 4486 proc, gtid, buf); 4487 }); 4488 4489 if (__kmp_env_consistency_check) { 4490 if ((mask == NULL) || (*mask == NULL)) { 4491 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4492 } 4493 } 4494 4495 if ((proc < 0) 4496 # if !KMP_USE_HWLOC 4497 || ((unsigned)proc >= KMP_CPU_SETSIZE) 4498 # endif 4499 ) { 4500 return -1; 4501 } 4502 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4503 return -2; 4504 } 4505 4506 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4507 return 0; 4508 } 4509 4510 4511 int 4512 __kmp_aux_get_affinity_mask_proc(int proc, void **mask) 4513 { 4514 int retval; 4515 4516 if (! 
KMP_AFFINITY_CAPABLE()) { 4517 return -1; 4518 } 4519 4520 KA_TRACE(1000, ;{ 4521 int gtid = __kmp_entry_gtid(); 4522 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4523 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4524 (kmp_affin_mask_t *)(*mask)); 4525 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n", 4526 proc, gtid, buf); 4527 }); 4528 4529 if (__kmp_env_consistency_check) { 4530 if ((mask == NULL) || (*mask == NULL)) { 4531 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4532 } 4533 } 4534 4535 if ((proc < 0) 4536 # if !KMP_USE_HWLOC 4537 || ((unsigned)proc >= KMP_CPU_SETSIZE) 4538 # endif 4539 ) { 4540 return -1; 4541 } 4542 if (! KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4543 return 0; 4544 } 4545 4546 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4547 } 4548 4549 4550 // Dynamic affinity settings - Affinity balanced 4551 void __kmp_balanced_affinity( int tid, int nthreads ) 4552 { 4553 if( __kmp_affinity_uniform_topology() ) { 4554 int coreID; 4555 int threadID; 4556 // Number of hyper threads per core in HT machine 4557 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4558 // Number of cores 4559 int ncores = __kmp_ncores; 4560 // How many threads will be bound to each core 4561 int chunk = nthreads / ncores; 4562 // How many cores will have an additional thread bound to it - "big cores" 4563 int big_cores = nthreads % ncores; 4564 // Number of threads on the big cores 4565 int big_nth = ( chunk + 1 ) * big_cores; 4566 if( tid < big_nth ) { 4567 coreID = tid / (chunk + 1 ); 4568 threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ; 4569 } else { //tid >= big_nth 4570 coreID = ( tid - big_cores ) / chunk; 4571 threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ; 4572 } 4573 4574 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4575 "Illegal set affinity operation when not capable"); 4576 4577 kmp_affin_mask_t *mask; 4578 KMP_CPU_ALLOC_ON_STACK(mask); 4579 KMP_CPU_ZERO(mask); 4580 4581 // Granularity == thread 4582 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4583 int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second; 4584 KMP_CPU_SET( osID, mask); 4585 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4586 for( int i = 0; i < __kmp_nth_per_core; i++ ) { 4587 int osID; 4588 osID = address2os[ coreID * __kmp_nth_per_core + i ].second; 4589 KMP_CPU_SET( osID, mask); 4590 } 4591 } 4592 if (__kmp_affinity_verbose) { 4593 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4594 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4595 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4596 tid, buf); 4597 } 4598 __kmp_set_system_affinity( mask, TRUE ); 4599 KMP_CPU_FREE_FROM_STACK(mask); 4600 } else { // Non-uniform topology 4601 4602 kmp_affin_mask_t *mask; 4603 KMP_CPU_ALLOC_ON_STACK(mask); 4604 KMP_CPU_ZERO(mask); 4605 4606 // Number of hyper threads per core in HT machine 4607 int nth_per_core = __kmp_nThreadsPerCore; 4608 int core_level; 4609 if( nth_per_core > 1 ) { 4610 core_level = __kmp_aff_depth - 2; 4611 } else { 4612 core_level = __kmp_aff_depth - 1; 4613 } 4614 4615 // Number of cores - maximum value; it does not count trail cores with 0 processors 4616 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 4617 4618 // For performance gain consider the special case nthreads == __kmp_avail_proc 4619 if( nthreads == __kmp_avail_proc ) { 4620 if( 
            __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them. Since address2os is sorted, we can stop as
                // soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with exactly "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with between "x" and nth_per_core available processors
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads assigned to each thread context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 available processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i *
nth_per_core + k ] ++; 4730 cnt--; 4731 nth--; 4732 break; 4733 } 4734 } 4735 } 4736 } 4737 if( cnt == 0 || nth == 0 ) { 4738 break; 4739 } 4740 } 4741 if( nth == 0 ) { 4742 break; 4743 } 4744 } 4745 flag = 1; 4746 } 4747 int sum = 0; 4748 for( int i = 0; i < nproc; i++ ) { 4749 sum += newarr[ i ]; 4750 if( sum > tid ) { 4751 // Granularity == thread 4752 if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) { 4753 int osID = procarr[ i ]; 4754 KMP_CPU_SET( osID, mask); 4755 } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core 4756 int coreID = i / nth_per_core; 4757 for( int ii = 0; ii < nth_per_core; ii++ ) { 4758 int osID = procarr[ coreID * nth_per_core + ii ]; 4759 if( osID != -1 ) { 4760 KMP_CPU_SET( osID, mask); 4761 } 4762 } 4763 } 4764 break; 4765 } 4766 } 4767 __kmp_free( newarr ); 4768 } 4769 4770 if (__kmp_affinity_verbose) { 4771 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4772 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4773 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4774 tid, buf); 4775 } 4776 __kmp_set_system_affinity( mask, TRUE ); 4777 KMP_CPU_FREE_FROM_STACK(mask); 4778 } 4779 } 4780 4781 #if KMP_OS_LINUX 4782 // We don't need this entry for Windows because 4783 // there is GetProcessAffinityMask() api 4784 // 4785 // The intended usage is indicated by these steps: 4786 // 1) The user gets the current affinity mask 4787 // 2) Then sets the affinity by calling this function 4788 // 3) Error check the return value 4789 // 4) Use non-OpenMP parallelization 4790 // 5) Reset the affinity to what was stored in step 1) 4791 #ifdef __cplusplus 4792 extern "C" 4793 #endif 4794 int 4795 kmp_set_thread_affinity_mask_initial() 4796 // the function returns 0 on success, 4797 // -1 if we cannot bind thread 4798 // >0 (errno) if an error happened during binding 4799 { 4800 int gtid = __kmp_get_gtid(); 4801 if (gtid < 0) { 4802 // Do not touch non-omp threads 4803 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 4804 "non-omp thread, returning\n")); 4805 return -1; 4806 } 4807 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 4808 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 4809 "affinity not initialized, returning\n")); 4810 return -1; 4811 } 4812 KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: " 4813 "set full mask for thread %d\n", gtid)); 4814 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 4815 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 4816 } 4817 #endif 4818 4819 #endif // KMP_AFFINITY_SUPPORTED 4820