/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif
#include <ctype.h>

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? "sockets" : "socket");
  case KMP_HW_DIE:
    return ((plural) ? "dice" : "die");
  case KMP_HW_MODULE:
    return ((plural) ? "modules" : "module");
  case KMP_HW_TILE:
    return ((plural) ? "tiles" : "tile");
  case KMP_HW_NUMA:
    return ((plural) ? "numa_domains" : "numa_domain");
  case KMP_HW_L3:
    return ((plural) ? "l3_caches" : "l3_cache");
  case KMP_HW_L2:
    return ((plural) ? "l2_caches" : "l2_cache");
  case KMP_HW_L1:
    return ((plural) ? "l1_caches" : "l1_cache");
  case KMP_HW_LLC:
    return ((plural) ? "ll_caches" : "ll_cache");
  case KMP_HW_CORE:
    return ((plural) ? "cores" : "core");
  case KMP_HW_THREAD:
    return ((plural) ? "threads" : "thread");
  case KMP_HW_PROC_GROUP:
    return ((plural) ? "proc_groups" : "proc_group");
  }
  return ((plural) ? "unknowns" : "unknown");
}

const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
  switch (type) {
  case KMP_HW_CORE_TYPE_UNKNOWN:
    return "unknown";
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  case KMP_HW_CORE_TYPE_ATOM:
    return "Intel Atom(R) processor";
  case KMP_HW_CORE_TYPE_CORE:
    return "Intel(R) Core(TM) processor";
#endif
  }
  return "unknown";
}

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  for (int level = 0; level < depth; ++level) {
    if (ahwthread->ids[level] < bhwthread->ids[level])
      return -1;
    else if (ahwthread->ids[level] > bhwthread->ids[level])
      return 1;
  }
  if (ahwthread->os_id < bhwthread->os_id)
    return -1;
  else if (ahwthread->os_id > bhwthread->os_id)
    return 1;
  return 0;
}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
#endif

void kmp_hw_thread_t::print() const {
  int depth = __kmp_topology->get_depth();
  printf("%4d ", os_id);
  for (int i = 0; i < depth; ++i) {
    printf("%4d ", ids[i]);
  }
  if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) {
    printf(" (%s)", __kmp_hw_get_core_type_string(core_type));
  }
  printf("\n");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Add a layer to the topology based on the ids. Assume the topology
// is perfectly nested (i.e., no object has more than one parent)
void kmp_topology_t::_insert_layer(kmp_hw_t type, const int *ids) {
  // Figure out where the layer should go by comparing the ids of the current
  // layers with the new ids
  int target_layer;
  int previous_id = kmp_hw_thread_t::UNKNOWN_ID;
  int previous_new_id = kmp_hw_thread_t::UNKNOWN_ID;

  // Start from the highest layer and work down to find target layer
  // If new layer is equal to another layer then put the new layer above
  for (target_layer = 0; target_layer < depth; ++target_layer) {
    bool layers_equal = true;
    bool strictly_above_target_layer = false;
    for (int i = 0; i < num_hw_threads; ++i) {
      int id = hw_threads[i].ids[target_layer];
      int new_id = ids[i];
      if (id != previous_id && new_id == previous_new_id) {
        // Found the layer we are strictly above
        strictly_above_target_layer = true;
        layers_equal = false;
        break;
      } else if (id == previous_id && new_id != previous_new_id) {
        // Found a layer we are below. Move to next layer and check.
        layers_equal = false;
        break;
      }
      previous_id = id;
      previous_new_id = new_id;
    }
    if (strictly_above_target_layer || layers_equal)
      break;
  }

  // Found the layer we are above. Now move everything to accommodate the new
  // layer. And put the new ids and type into the topology.
  for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
    types[j] = types[i];
  types[target_layer] = type;
  for (int k = 0; k < num_hw_threads; ++k) {
    for (int i = depth - 1, j = depth; i >= target_layer; --i, --j)
      hw_threads[k].ids[j] = hw_threads[k].ids[i];
    hw_threads[k].ids[target_layer] = ids[k];
  }
  equivalent[type] = type;
  depth++;
}

#if KMP_GROUP_AFFINITY
// Insert the Windows Processor Group structure into the topology
void kmp_topology_t::_insert_windows_proc_groups() {
  // Do not insert the processor group structure for a single group
  if (__kmp_num_proc_groups == 1)
    return;
  kmp_affin_mask_t *mask;
  int *ids = (int *)__kmp_allocate(sizeof(int) * num_hw_threads);
  KMP_CPU_ALLOC(mask);
  for (int i = 0; i < num_hw_threads; ++i) {
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(hw_threads[i].os_id, mask);
    ids[i] = __kmp_get_proc_group(mask);
  }
  KMP_CPU_FREE(mask);
  _insert_layer(KMP_HW_PROC_GROUP, ids);
  __kmp_free(ids);
}
#endif

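// A layer is radix-1 when each entity in the layer directly above it contains
// exactly one entity of that layer. For example (illustrative topology only),
// if every socket on a machine contains exactly one L3 cache, the L3 layer
// adds no information; it is folded away below and KMP_HW_L3 is recorded as
// equivalent to KMP_HW_SOCKET.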
// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_SOCKET] = 110;
  preference[KMP_HW_PROC_GROUP] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}

void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}

// Gather the count of each topology layer and the ratio
void kmp_topology_t::_gather_enumeration_information() {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];
  int previous_core_id = kmp_hw_thread_t::UNKNOWN_ID;

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      core_types_count[i] = 0;
      core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
    }
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.ids[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.ids[layer];
    }
    // Figure out the number of each core type for hybrid CPUs
    if (__kmp_is_hybrid_cpu()) {
      int core_level = get_level(KMP_HW_CORE);
      if (core_level != -1) {
        if (hw_thread.ids[core_level] != previous_core_id)
          _increment_core_type(hw_thread.core_type);
        previous_core_id = hw_thread.ids[core_level];
      }
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  flags.uniform = (num == count[depth - 1]);
}

// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {
  int previous_id[KMP_HW_LAST];
  int sub_id[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    sub_id[i] = -1;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Setup the sub_id
    for (int j = 0; j < depth; ++j) {
      if (hw_thread.ids[j] != previous_id[j]) {
        sub_id[j]++;
        for (int k = j + 1; k < depth; ++k) {
          sub_id[k] = 0;
        }
        break;
      }
    }
    // Set previous_id
    for (int j = 0; j < depth; ++j) {
      previous_id[j] = hw_thread.ids[j];
    }
    // Set the sub_ids field
    for (int j = 0; j < depth; ++j) {
      hw_thread.sub_ids[j] = sub_id[j];
    }
  }
}

void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * (size_t)KMP_HW_LAST * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + (size_t)KMP_HW_LAST;
  retval->count = arr + 2 * (size_t)KMP_HW_LAST;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}

bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* core_types:\n");
  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
    if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) {
      printf(" %d %s core%c\n", core_types_count[i],
             __kmp_hw_get_core_type_string(core_types[i]),
             ((core_types_count[i] > 1) ? 's' : ' '));
    } else {
      if (i == 0)
        printf("No hybrid information available\n");
      break;
    }
  }

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}

void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

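  // The loop below builds the human-readable "quick topology" summary. For
  // example (illustrative machine only), a detection of 2 sockets, 12 cores
  // per socket and 2 threads per core is reported roughly as
  // "2 Sockets x 12 Cores/Socket x 2 Threads/Core" with ncores = 24; the
  // exact wording comes from the message catalog strings used below.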
  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN)
        break;
      KMP_INFORM(TopologyHybrid, env_var, core_types_count[i],
                 __kmp_hw_get_core_type_string(core_types[i]));
    }
  }

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    if (__kmp_is_hybrid_cpu())
      __kmp_str_buf_print(
          &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type));
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}

void kmp_topology_t::canonicalize() {
#if KMP_GROUP_AFFINITY
  _insert_windows_proc_groups();
#endif
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

#if KMP_MIC_SUPPORTED
  // Manually Add L2 = Tile equivalence
  if (__kmp_mic_type == mic3) {
    if (get_level(KMP_HW_L2) != -1)
      set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
    else if (get_level(KMP_HW_TILE) != -1)
      set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
  }
#endif

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
#if KMP_GROUP_AFFINITY
    // If more than one processor group exists, and the level of
    // granularity specified by the user is too coarse, then the
    // granularity must be adjusted "down" to processor group affinity
    // because threads can only exist within one processor group.
    // For example, if a user sets granularity=socket and there are two
    // processor groups that cover a socket, then the runtime must
    // restrict the granularity down to the processor group level.
    if (__kmp_num_proc_groups > 1) {
      int gran_depth = __kmp_topology->get_level(gran_type);
      int proc_group_depth = __kmp_topology->get_level(KMP_HW_PROC_GROUP);
      if (gran_depth >= 0 && proc_group_depth >= 0 &&
          gran_depth < proc_group_depth) {
        KMP_WARNING(AffGranTooCoarseProcGroup, "KMP_AFFINITY",
                    __kmp_hw_get_catalog_string(__kmp_affinity_gran));
        __kmp_affinity_gran = gran_type = KMP_HW_PROC_GROUP;
      }
    }
#endif
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}

// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
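  // For reference, KMP_HW_SUBSET describes the topology slice to keep as a
  // list of <num><type>[@<offset>] items, e.g. KMP_HW_SUBSET=2s,4c,2t keeps
  // 2 sockets x 4 cores/socket x 2 threads/core (illustrative value, not one
  // detected from this machine).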
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // First, sort the KMP_HW_SUBSET items by the machine topology
  __kmp_hw_subset->sort();

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}

bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
  if (hw_level >= depth)
    return true;
  bool retval = true;
  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
  for (int i = 0; i < (depth - hw_level); ++i) {
    if (t1.ids[i] != t2.ids[i])
      return false;
  }
  return retval;
}

////////////////////////////////////////////////////////////////////////////////

#if KMP_AFFINITY_SUPPORTED
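// RAII helper: the constructor snapshots the calling thread's system affinity
// mask, and either an explicit restore() or the destructor puts it back, so
// the topology-detection code below can temporarily rebind the thread without
// leaking the change on any early-return path.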
class kmp_affinity_raii_t {
  kmp_affin_mask_t *mask;
  bool restored;

public:
  kmp_affinity_raii_t() : restored(false) {
    KMP_CPU_ALLOC(mask);
    KMP_ASSERT(mask != NULL);
    __kmp_get_system_affinity(mask, TRUE);
  }
  void restore() {
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE(mask);
    restored = true;
  }
  ~kmp_affinity_raii_t() {
    if (!restored) {
      __kmp_set_system_affinity(mask, TRUE);
      KMP_CPU_FREE(mask);
    }
  }
};

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

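  // scan advances through buf as each range is printed; end points at the
  // last writable byte, so end - scan + 1 is the space still available to
  // KMP_SNPRINTF (including room for the terminating NUL).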
  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

// Return (possibly empty) affinity mask representing the offline CPUs
// Caller must free the mask
kmp_affin_mask_t *__kmp_affinity_get_offline_cpus() {
  kmp_affin_mask_t *offline;
  KMP_CPU_ALLOC(offline);
  KMP_CPU_ZERO(offline);
#if KMP_OS_LINUX
  int n, begin_cpu, end_cpu;
  kmp_safe_raii_file_t offline_file;
  auto skip_ws = [](FILE *f) {
    int c;
    do {
      c = fgetc(f);
    } while (isspace(c));
    if (c != EOF)
      ungetc(c, f);
  };
  // File contains CSV of integer ranges representing the offline CPUs
  // e.g., 1,2,4-7,9,11-15
  int status = offline_file.try_open("/sys/devices/system/cpu/offline", "r");
  if (status != 0)
    return offline;
  while (!feof(offline_file)) {
    skip_ws(offline_file);
    n = fscanf(offline_file, "%d", &begin_cpu);
    if (n != 1)
      break;
    skip_ws(offline_file);
    int c = fgetc(offline_file);
    if (c == EOF || c == ',') {
      // Just single CPU
      end_cpu = begin_cpu;
    } else if (c == '-') {
      // Range of CPUs
      skip_ws(offline_file);
      n = fscanf(offline_file, "%d", &end_cpu);
      if (n != 1)
        break;
      skip_ws(offline_file);
      c = fgetc(offline_file); // skip ','
    } else {
      // Syntax problem
      break;
    }
    // Ensure a valid range of CPUs
    if (begin_cpu < 0 || begin_cpu >= __kmp_xproc || end_cpu < 0 ||
        end_cpu >= __kmp_xproc || begin_cpu > end_cpu) {
      continue;
    }
    // Insert [begin_cpu, end_cpu] into offline mask
    for (int cpu = begin_cpu; cpu <= end_cpu; ++cpu) {
      KMP_CPU_SET(cpu, offline);
    }
  }
#endif
  return offline;
}

// Return the number of available procs
int __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  int avail_proc = 0;
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
        avail_proc++;
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
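    // Non-group path: mark every processor the OS reports, except CPUs that
    // /sys/devices/system/cpu/offline lists as offline (Linux only; on other
    // systems the offline mask built above is empty).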
    int proc;
    kmp_affin_mask_t *offline_cpus = __kmp_affinity_get_offline_cpus();
    for (proc = 0; proc < __kmp_xproc; proc++) {
      // Skip offline CPUs
      if (KMP_CPU_ISSET(proc, offline_cpus))
        continue;
      KMP_CPU_SET(proc, mask);
      avail_proc++;
    }
    KMP_CPU_FREE(offline_cpus);
  }

  return avail_proc;
}

// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it. Each routine
// returns a boolean on whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
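// For instance (illustrative numbers only), on a 2-socket machine with 24
// cores per socket and 2 PUs per core, passing the first package object and
// type = HWLOC_OBJ_PU would count 48 PUs.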
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // store sub_id + 1 so that 0 is distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
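    // Note that the counts derived below are estimates: nPackages is computed
    // by rounding __kmp_xproc up to a multiple of nCoresPerPkg, so on a
    // machine with asymmetric packages the result may not match the true
    // package count.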
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity_verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
  }

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    return true;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
    avail_ct++;
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }
  return true;
}

#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
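// Each group contributes CHAR_BIT * sizeof(DWORD_PTR) OS proc slots (64 on
// 64-bit Windows), so below proc i is assigned to group i / BITS_PER_GROUP
// with i % BITS_PER_GROUP as its id within the group.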
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
  }
  return true;
}
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

class kmp_cache_info_t {
public:
  struct info_t {
    unsigned level, mask;
  };
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth; }
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    switch (level) {
    case 1:
      return KMP_HW_L1;
    case 2:
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

private:
  static const int MAX_CACHE_LEVEL = 3;

  size_t depth;
  info_t table[MAX_CACHE_LEVEL];

  void get_leaf4_levels() {
    unsigned level = 0;
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      kmp_cpuid buf2;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      if (!cache_type)
        break;
      // Skip instruction caches
      if (cache_type == 2) {
        level++;
        continue;
      }
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      table[depth].mask = ((-1) << cache_mask_width);
      depth++;
      level++;
    }
  }
};

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
  }

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return false;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    return true;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
1786 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 1787 continue; 1788 } 1789 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc); 1790 1791 __kmp_affinity_dispatch->bind_thread(i); 1792 threadInfo[nApics].osId = i; 1793 1794 // The apic id and max threads per pkg come from cpuid(1). 1795 __kmp_x86_cpuid(1, 0, &buf); 1796 if (((buf.edx >> 9) & 1) == 0) { 1797 __kmp_free(threadInfo); 1798 *msg_id = kmp_i18n_str_ApicNotPresent; 1799 return false; 1800 } 1801 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 1802 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 1803 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 1804 threadInfo[nApics].maxThreadsPerPkg = 1; 1805 } 1806 1807 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded 1808 // value. 1809 // 1810 // First, we need to check if cpuid(4) is supported on this chip. To see if 1811 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n 1812 // or greater. 1813 __kmp_x86_cpuid(0, 0, &buf); 1814 if (buf.eax >= 4) { 1815 __kmp_x86_cpuid(4, 0, &buf); 1816 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 1817 } else { 1818 threadInfo[nApics].maxCoresPerPkg = 1; 1819 } 1820 1821 // Infer the pkgId / coreId / threadId using only the info obtained locally. 1822 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg); 1823 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 1824 1825 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg); 1826 int widthT = widthCT - widthC; 1827 if (widthT < 0) { 1828 // I've never seen this one happen, but I suppose it could, if the cpuid 1829 // instruction on a chip was really screwed up. Make sure to restore the 1830 // affinity mask before the tail call. 1831 __kmp_free(threadInfo); 1832 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1833 return false; 1834 } 1835 1836 int maskC = (1 << widthC) - 1; 1837 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC; 1838 1839 int maskT = (1 << widthT) - 1; 1840 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT; 1841 1842 nApics++; 1843 } 1844 1845 // We've collected all the info we need. 1846 // Restore the old affinity mask for this thread. 1847 previous_affinity.restore(); 1848 1849 // Sort the threadInfo table by physical Id. 1850 qsort(threadInfo, nApics, sizeof(*threadInfo), 1851 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1852 1853 // The table is now sorted by pkgId / coreId / threadId, but we really don't 1854 // know the radix of any of the fields. pkgId's may be sparsely assigned among 1855 // the chips on a system. Although coreId's are usually assigned 1856 // [0 .. coresPerPkg-1] and threadId's are usually assigned 1857 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1858 // 1859 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 1860 // total # packages) are at this point - we want to determine that now. We 1861 // only have an upper bound on the first two figures. 1862 // 1863 // We also perform a consistency check at this point: the values returned by 1864 // the cpuid instruction for any thread bound to a given package had better 1865 // return the same info for maxThreadsPerPkg and maxCoresPerPkg. 
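  //
  // As an illustrative sketch (a hypothetical machine, not measured data):
  // six sorted entries with (pkgId, coreId, threadId) =
  //   (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1)
  // describe 2 packages, at most 2 cores per package and 2 threads per core,
  // so the counting loop below would finish with nPackages == 2,
  // nCoresPerPkg == 2, __kmp_nThreadsPerCore == 2 and __kmp_ncores == 3.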
1866 nPackages = 1; 1867 nCoresPerPkg = 1; 1868 __kmp_nThreadsPerCore = 1; 1869 unsigned nCores = 1; 1870 1871 unsigned pkgCt = 1; // to determine radii 1872 unsigned lastPkgId = threadInfo[0].pkgId; 1873 unsigned coreCt = 1; 1874 unsigned lastCoreId = threadInfo[0].coreId; 1875 unsigned threadCt = 1; 1876 unsigned lastThreadId = threadInfo[0].threadId; 1877 1878 // intra-pkg consist checks 1879 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1880 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1881 1882 for (i = 1; i < nApics; i++) { 1883 if (threadInfo[i].pkgId != lastPkgId) { 1884 nCores++; 1885 pkgCt++; 1886 lastPkgId = threadInfo[i].pkgId; 1887 if ((int)coreCt > nCoresPerPkg) 1888 nCoresPerPkg = coreCt; 1889 coreCt = 1; 1890 lastCoreId = threadInfo[i].coreId; 1891 if ((int)threadCt > __kmp_nThreadsPerCore) 1892 __kmp_nThreadsPerCore = threadCt; 1893 threadCt = 1; 1894 lastThreadId = threadInfo[i].threadId; 1895 1896 // This is a different package, so go on to the next iteration without 1897 // doing any consistency checks. Reset the consistency check vars, though. 1898 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1899 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1900 continue; 1901 } 1902 1903 if (threadInfo[i].coreId != lastCoreId) { 1904 nCores++; 1905 coreCt++; 1906 lastCoreId = threadInfo[i].coreId; 1907 if ((int)threadCt > __kmp_nThreadsPerCore) 1908 __kmp_nThreadsPerCore = threadCt; 1909 threadCt = 1; 1910 lastThreadId = threadInfo[i].threadId; 1911 } else if (threadInfo[i].threadId != lastThreadId) { 1912 threadCt++; 1913 lastThreadId = threadInfo[i].threadId; 1914 } else { 1915 __kmp_free(threadInfo); 1916 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1917 return false; 1918 } 1919 1920 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1921 // fields agree between all the threads bounds to a given package. 1922 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || 1923 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1924 __kmp_free(threadInfo); 1925 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1926 return false; 1927 } 1928 } 1929 // When affinity is off, this routine will still be called to set 1930 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1931 // Make sure all these vars are set correctly 1932 nPackages = pkgCt; 1933 if ((int)coreCt > nCoresPerPkg) 1934 nCoresPerPkg = coreCt; 1935 if ((int)threadCt > __kmp_nThreadsPerCore) 1936 __kmp_nThreadsPerCore = threadCt; 1937 __kmp_ncores = nCores; 1938 KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); 1939 1940 // Now that we've determined the number of packages, the number of cores per 1941 // package, and the number of threads per core, we can construct the data 1942 // structure that is to be returned. 1943 int idx = 0; 1944 int pkgLevel = 0; 1945 int coreLevel = 1; 1946 int threadLevel = 2; 1947 //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1948 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1949 kmp_hw_t types[3]; 1950 if (pkgLevel >= 0) 1951 types[idx++] = KMP_HW_SOCKET; 1952 if (coreLevel >= 0) 1953 types[idx++] = KMP_HW_CORE; 1954 if (threadLevel >= 0) 1955 types[idx++] = KMP_HW_THREAD; 1956 1957 KMP_ASSERT(depth > 0); 1958 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 1959 1960 for (i = 0; i < nApics; ++i) { 1961 idx = 0; 1962 unsigned os = threadInfo[i].osId; 1963 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 1964 hw_thread.clear(); 1965 1966 if (pkgLevel >= 0) { 1967 hw_thread.ids[idx++] = threadInfo[i].pkgId; 1968 } 1969 if (coreLevel >= 0) { 1970 hw_thread.ids[idx++] = threadInfo[i].coreId; 1971 } 1972 if (threadLevel >= 0) { 1973 hw_thread.ids[idx++] = threadInfo[i].threadId; 1974 } 1975 hw_thread.os_id = os; 1976 } 1977 1978 __kmp_free(threadInfo); 1979 __kmp_topology->sort_ids(); 1980 if (!__kmp_topology->check_ids()) { 1981 kmp_topology_t::deallocate(__kmp_topology); 1982 __kmp_topology = nullptr; 1983 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1984 return false; 1985 } 1986 return true; 1987 } 1988 1989 // Hybrid cpu detection using CPUID.1A 1990 // Thread should be pinned to processor already 1991 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, 1992 unsigned *native_model_id) { 1993 kmp_cpuid buf; 1994 __kmp_x86_cpuid(0x1a, 0, &buf); 1995 *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax); 1996 *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); 1997 } 1998 1999 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 2000 // architectures support a newer interface for specifying the x2APIC Ids, 2001 // based on CPUID.B or CPUID.1F 2002 /* 2003 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 2004 Bits Bits Bits Bits 2005 31-16 15-8 7-4 4-0 2006 ---+-----------+--------------+-------------+-----------------+ 2007 EAX| reserved | reserved | reserved | Bits to Shift | 2008 ---+-----------|--------------+-------------+-----------------| 2009 EBX| reserved | Num logical processors at level (16 bits) | 2010 ---+-----------|--------------+-------------------------------| 2011 ECX| reserved | Level Type | Level Number (8 bits) | 2012 ---+-----------+--------------+-------------------------------| 2013 EDX| X2APIC ID (32 bits) | 2014 ---+----------------------------------------------------------+ 2015 */ 2016 2017 enum { 2018 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 2019 INTEL_LEVEL_TYPE_SMT = 1, 2020 INTEL_LEVEL_TYPE_CORE = 2, 2021 INTEL_LEVEL_TYPE_TILE = 3, 2022 INTEL_LEVEL_TYPE_MODULE = 4, 2023 INTEL_LEVEL_TYPE_DIE = 5, 2024 INTEL_LEVEL_TYPE_LAST = 6, 2025 }; 2026 2027 struct cpuid_level_info_t { 2028 unsigned level_type, mask, mask_width, nitems, cache_mask; 2029 }; 2030 2031 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 2032 switch (intel_type) { 2033 case INTEL_LEVEL_TYPE_INVALID: 2034 return KMP_HW_SOCKET; 2035 case INTEL_LEVEL_TYPE_SMT: 2036 return KMP_HW_THREAD; 2037 case INTEL_LEVEL_TYPE_CORE: 2038 return KMP_HW_CORE; 2039 case INTEL_LEVEL_TYPE_TILE: 2040 return KMP_HW_TILE; 2041 case INTEL_LEVEL_TYPE_MODULE: 2042 return KMP_HW_MODULE; 2043 case INTEL_LEVEL_TYPE_DIE: 2044 return KMP_HW_DIE; 2045 } 2046 return KMP_HW_UNKNOWN; 2047 } 2048 2049 // This function takes the topology leaf, a levels array to store the levels 2050 // detected and a bitmap of the known levels. 
2051 // Returns the number of levels in the topology 2052 static unsigned 2053 __kmp_x2apicid_get_levels(int leaf, 2054 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 2055 kmp_uint64 known_levels) { 2056 unsigned level, levels_index; 2057 unsigned level_type, mask_width, nitems; 2058 kmp_cpuid buf; 2059 2060 // New algorithm has known topology layers act as highest unknown topology 2061 // layers when unknown topology layers exist. 2062 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 2063 // are unknown topology layers, Then SMT will take the characteristics of 2064 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 2065 // This eliminates unknown portions of the topology while still keeping the 2066 // correct structure. 2067 level = levels_index = 0; 2068 do { 2069 __kmp_x86_cpuid(leaf, level, &buf); 2070 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 2071 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 2072 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 2073 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 2074 return 0; 2075 2076 if (known_levels & (1ull << level_type)) { 2077 // Add a new level to the topology 2078 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 2079 levels[levels_index].level_type = level_type; 2080 levels[levels_index].mask_width = mask_width; 2081 levels[levels_index].nitems = nitems; 2082 levels_index++; 2083 } else { 2084 // If it is an unknown level, then logically move the previous layer up 2085 if (levels_index > 0) { 2086 levels[levels_index - 1].mask_width = mask_width; 2087 levels[levels_index - 1].nitems = nitems; 2088 } 2089 } 2090 level++; 2091 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 2092 2093 // Set the masks to & with apicid 2094 for (unsigned i = 0; i < levels_index; ++i) { 2095 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 2096 levels[i].mask = ~((-1) << levels[i].mask_width); 2097 levels[i].cache_mask = (-1) << levels[i].mask_width; 2098 for (unsigned j = 0; j < i; ++j) 2099 levels[i].mask ^= levels[j].mask; 2100 } else { 2101 KMP_DEBUG_ASSERT(levels_index > 0); 2102 levels[i].mask = (-1) << levels[i - 1].mask_width; 2103 levels[i].cache_mask = 0; 2104 } 2105 } 2106 return levels_index; 2107 } 2108 2109 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 2110 2111 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 2112 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 2113 unsigned levels_index; 2114 kmp_cpuid buf; 2115 kmp_uint64 known_levels; 2116 int topology_leaf, highest_leaf, apic_id; 2117 int num_leaves; 2118 static int leaves[] = {0, 0}; 2119 2120 kmp_i18n_id_t leaf_message_id; 2121 2122 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 2123 2124 *msg_id = kmp_i18n_null; 2125 if (__kmp_affinity_verbose) { 2126 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 2127 } 2128 2129 // Figure out the known topology levels 2130 known_levels = 0ull; 2131 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 2132 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 2133 known_levels |= (1ull << i); 2134 } 2135 } 2136 2137 // Get the highest cpuid leaf supported 2138 __kmp_x86_cpuid(0, 0, &buf); 2139 highest_leaf = buf.eax; 2140 2141 // If a specific topology method was requested, only allow that specific leaf 2142 // otherwise, try both leaves 31 and 11 in that order 2143 num_leaves = 0; 2144 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 2145 num_leaves = 1; 2146 leaves[0] = 11; 2147 
    leaf_message_id = kmp_i18n_str_NoLeaf11Support;
  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
    num_leaves = 1;
    leaves[0] = 31;
    leaf_message_id = kmp_i18n_str_NoLeaf31Support;
  } else {
    num_leaves = 2;
    leaves[0] = 31;
    leaves[1] = 11;
    leaf_message_id = kmp_i18n_str_NoLeaf11Support;
  }

  // Check to see if cpuid leaf 31 or 11 is supported.
  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
  topology_leaf = -1;
  for (int i = 0; i < num_leaves; ++i) {
    int leaf = leaves[i];
    if (highest_leaf < leaf)
      continue;
    __kmp_x86_cpuid(leaf, 0, &buf);
    if (buf.ebx == 0)
      continue;
    topology_leaf = leaf;
    levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
    if (levels_index == 0)
      continue;
    break;
  }
  if (topology_leaf == -1 || levels_index == 0) {
    *msg_id = leaf_message_id;
    return false;
  }
  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    for (unsigned i = 0; i < levels_index; ++i) {
      if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
        __kmp_nThreadsPerCore = levels[i].nitems;
      } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
        nCoresPerPkg = levels[i].nitems;
      }
    }
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  // Allocate the data structure to be returned.
  int depth = levels_index;
  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
    types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
  __kmp_topology =
      kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types);

  // Insert equivalent cache types if they exist
  kmp_cache_info_t cache_info;
  for (size_t i = 0; i < cache_info.get_depth(); ++i) {
    const kmp_cache_info_t::info_t &info = cache_info[i];
    unsigned cache_mask = info.mask;
    unsigned cache_level = info.level;
    for (unsigned j = 0; j < levels_index; ++j) {
      unsigned hw_cache_mask = levels[j].cache_mask;
      kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level);
      if (hw_cache_mask == cache_mask && j < levels_index - 1) {
        kmp_hw_t type =
            __kmp_intel_type_2_topology_type(levels[j + 1].level_type);
        __kmp_topology->set_equivalent_type(cache_type, type);
      }
    }
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
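  //
  // Illustrative sketch only (the level widths are hypothetical, not taken
  // from a real part): with an SMT level of mask_width 1 and a CORE level of
  // mask_width 5, an x2APIC id of 0b101101 decodes as
  //   thread  = 0b101101 & 0x1          = 1
  //   core    = (0b101101 & 0x1e) >> 1  = 6
  //   package = 0b101101 >> 5           = 1
  // which is what the mask/shift loop below computes per hardware thread.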
2235 unsigned int proc; 2236 int hw_thread_index = 0; 2237 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2238 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2239 unsigned my_levels_index; 2240 2241 // Skip this proc if it is not included in the machine model. 2242 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2243 continue; 2244 } 2245 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2246 2247 __kmp_affinity_dispatch->bind_thread(proc); 2248 2249 // New algorithm 2250 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2251 apic_id = buf.edx; 2252 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2253 my_levels_index = 2254 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2255 if (my_levels_index == 0 || my_levels_index != levels_index) { 2256 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2257 return false; 2258 } 2259 hw_thread.clear(); 2260 hw_thread.os_id = proc; 2261 // Put in topology information 2262 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2263 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2264 if (j > 0) { 2265 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2266 } 2267 } 2268 // Hybrid information 2269 if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { 2270 kmp_hw_core_type_t type; 2271 unsigned native_model_id; 2272 __kmp_get_hybrid_info(&type, &native_model_id); 2273 hw_thread.core_type = type; 2274 } 2275 hw_thread_index++; 2276 } 2277 KMP_ASSERT(hw_thread_index > 0); 2278 __kmp_topology->sort_ids(); 2279 if (!__kmp_topology->check_ids()) { 2280 kmp_topology_t::deallocate(__kmp_topology); 2281 __kmp_topology = nullptr; 2282 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2283 return false; 2284 } 2285 return true; 2286 } 2287 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2288 2289 #define osIdIndex 0 2290 #define threadIdIndex 1 2291 #define coreIdIndex 2 2292 #define pkgIdIndex 3 2293 #define nodeIdIndex 4 2294 2295 typedef unsigned *ProcCpuInfo; 2296 static unsigned maxIndex = pkgIdIndex; 2297 2298 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2299 const void *b) { 2300 unsigned i; 2301 const unsigned *aa = *(unsigned *const *)a; 2302 const unsigned *bb = *(unsigned *const *)b; 2303 for (i = maxIndex;; i--) { 2304 if (aa[i] < bb[i]) 2305 return -1; 2306 if (aa[i] > bb[i]) 2307 return 1; 2308 if (i == osIdIndex) 2309 break; 2310 } 2311 return 0; 2312 } 2313 2314 #if KMP_USE_HIER_SCHED 2315 // Set the array sizes for the hierarchy layers 2316 static void __kmp_dispatch_set_hierarchy_values() { 2317 // Set the maximum number of L1's to number of cores 2318 // Set the maximum number of L2's to to either number of cores / 2 for 2319 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2320 // Or the number of cores for Intel(R) Xeon(R) processors 2321 // Set the maximum number of NUMA nodes and L3's to number of packages 2322 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2323 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2324 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2325 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2326 KMP_MIC_SUPPORTED 2327 if (__kmp_mic_type >= mic3) 2328 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2329 else 2330 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2331 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2332 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2333 
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2334 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2335 // Set the number of threads per unit 2336 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2337 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2338 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2339 __kmp_nThreadsPerCore; 2340 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2341 KMP_MIC_SUPPORTED 2342 if (__kmp_mic_type >= mic3) 2343 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2344 2 * __kmp_nThreadsPerCore; 2345 else 2346 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2347 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2348 __kmp_nThreadsPerCore; 2349 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2350 nCoresPerPkg * __kmp_nThreadsPerCore; 2351 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2352 nCoresPerPkg * __kmp_nThreadsPerCore; 2353 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2354 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2355 } 2356 2357 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2358 // i.e., this thread's L1 or this thread's L2, etc. 2359 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2360 int index = type + 1; 2361 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2362 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2363 if (type == kmp_hier_layer_e::LAYER_THREAD) 2364 return tid; 2365 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2366 return 0; 2367 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2368 if (tid >= num_hw_threads) 2369 tid = tid % num_hw_threads; 2370 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2371 } 2372 2373 // Return the number of t1's per t2 2374 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2375 int i1 = t1 + 1; 2376 int i2 = t2 + 1; 2377 KMP_DEBUG_ASSERT(i1 <= i2); 2378 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2379 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2380 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2381 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2382 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2383 } 2384 #endif // KMP_USE_HIER_SCHED 2385 2386 static inline const char *__kmp_cpuinfo_get_filename() { 2387 const char *filename; 2388 if (__kmp_cpuinfo_file != nullptr) 2389 filename = __kmp_cpuinfo_file; 2390 else 2391 filename = "/proc/cpuinfo"; 2392 return filename; 2393 } 2394 2395 static inline const char *__kmp_cpuinfo_get_envvar() { 2396 const char *envvar = nullptr; 2397 if (__kmp_cpuinfo_file != nullptr) 2398 envvar = "KMP_CPUINFO_FILE"; 2399 return envvar; 2400 } 2401 2402 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2403 // affinity map. 2404 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2405 kmp_i18n_id_t *const msg_id) { 2406 const char *filename = __kmp_cpuinfo_get_filename(); 2407 const char *envvar = __kmp_cpuinfo_get_envvar(); 2408 *msg_id = kmp_i18n_null; 2409 2410 if (__kmp_affinity_verbose) { 2411 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2412 } 2413 2414 kmp_safe_raii_file_t f(filename, "r", envvar); 2415 2416 // Scan of the file, and count the number of "processor" (osId) fields, 2417 // and find the highest value of <n> for a node_<n> field. 
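  //
  // For reference, a per-processor block in the format this parser accepts
  // looks roughly like the following (all values are hypothetical, and the
  // "thread id" / "node_<n> id" fields are optional extensions that a
  // KMP_CPUINFO_FILE-style input may supply):
  //
  //   processor       : 0
  //   physical id     : 0
  //   core id         : 0
  //   thread id       : 0
  //   node_0 id       : 0
  //
  // with each record terminated by a blank line.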
2418 char buf[256]; 2419 unsigned num_records = 0; 2420 while (!feof(f)) { 2421 buf[sizeof(buf) - 1] = 1; 2422 if (!fgets(buf, sizeof(buf), f)) { 2423 // Read errors presumably because of EOF 2424 break; 2425 } 2426 2427 char s1[] = "processor"; 2428 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2429 num_records++; 2430 continue; 2431 } 2432 2433 // FIXME - this will match "node_<n> <garbage>" 2434 unsigned level; 2435 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2436 // validate the input fisrt: 2437 if (level > (unsigned)__kmp_xproc) { // level is too big 2438 level = __kmp_xproc; 2439 } 2440 if (nodeIdIndex + level >= maxIndex) { 2441 maxIndex = nodeIdIndex + level; 2442 } 2443 continue; 2444 } 2445 } 2446 2447 // Check for empty file / no valid processor records, or too many. The number 2448 // of records can't exceed the number of valid bits in the affinity mask. 2449 if (num_records == 0) { 2450 *msg_id = kmp_i18n_str_NoProcRecords; 2451 return false; 2452 } 2453 if (num_records > (unsigned)__kmp_xproc) { 2454 *msg_id = kmp_i18n_str_TooManyProcRecords; 2455 return false; 2456 } 2457 2458 // Set the file pointer back to the beginning, so that we can scan the file 2459 // again, this time performing a full parse of the data. Allocate a vector of 2460 // ProcCpuInfo object, where we will place the data. Adding an extra element 2461 // at the end allows us to remove a lot of extra checks for termination 2462 // conditions. 2463 if (fseek(f, 0, SEEK_SET) != 0) { 2464 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2465 return false; 2466 } 2467 2468 // Allocate the array of records to store the proc info in. The dummy 2469 // element at the end makes the logic in filling them out easier to code. 2470 unsigned **threadInfo = 2471 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2472 unsigned i; 2473 for (i = 0; i <= num_records; i++) { 2474 threadInfo[i] = 2475 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2476 } 2477 2478 #define CLEANUP_THREAD_INFO \ 2479 for (i = 0; i <= num_records; i++) { \ 2480 __kmp_free(threadInfo[i]); \ 2481 } \ 2482 __kmp_free(threadInfo); 2483 2484 // A value of UINT_MAX means that we didn't find the field 2485 unsigned __index; 2486 2487 #define INIT_PROC_INFO(p) \ 2488 for (__index = 0; __index <= maxIndex; __index++) { \ 2489 (p)[__index] = UINT_MAX; \ 2490 } 2491 2492 for (i = 0; i <= num_records; i++) { 2493 INIT_PROC_INFO(threadInfo[i]); 2494 } 2495 2496 unsigned num_avail = 0; 2497 *line = 0; 2498 while (!feof(f)) { 2499 // Create an inner scoping level, so that all the goto targets at the end of 2500 // the loop appear in an outer scoping level. This avoids warnings about 2501 // jumping past an initialization to a target in the same block. 2502 { 2503 buf[sizeof(buf) - 1] = 1; 2504 bool long_line = false; 2505 if (!fgets(buf, sizeof(buf), f)) { 2506 // Read errors presumably because of EOF 2507 // If there is valid data in threadInfo[num_avail], then fake 2508 // a blank line in ensure that the last address gets parsed. 2509 bool valid = false; 2510 for (i = 0; i <= maxIndex; i++) { 2511 if (threadInfo[num_avail][i] != UINT_MAX) { 2512 valid = true; 2513 } 2514 } 2515 if (!valid) { 2516 break; 2517 } 2518 buf[0] = 0; 2519 } else if (!buf[sizeof(buf) - 1]) { 2520 // The line is longer than the buffer. Set a flag and don't 2521 // emit an error if we were going to ignore the line, anyway. 
2522 long_line = true; 2523 2524 #define CHECK_LINE \ 2525 if (long_line) { \ 2526 CLEANUP_THREAD_INFO; \ 2527 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2528 return false; \ 2529 } 2530 } 2531 (*line)++; 2532 2533 char s1[] = "processor"; 2534 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2535 CHECK_LINE; 2536 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2537 unsigned val; 2538 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2539 goto no_val; 2540 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2541 #if KMP_ARCH_AARCH64 2542 // Handle the old AArch64 /proc/cpuinfo layout differently, 2543 // it contains all of the 'processor' entries listed in a 2544 // single 'Processor' section, therefore the normal looking 2545 // for duplicates in that section will always fail. 2546 num_avail++; 2547 #else 2548 goto dup_field; 2549 #endif 2550 threadInfo[num_avail][osIdIndex] = val; 2551 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2552 char path[256]; 2553 KMP_SNPRINTF( 2554 path, sizeof(path), 2555 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2556 threadInfo[num_avail][osIdIndex]); 2557 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2558 2559 KMP_SNPRINTF(path, sizeof(path), 2560 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2561 threadInfo[num_avail][osIdIndex]); 2562 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2563 continue; 2564 #else 2565 } 2566 char s2[] = "physical id"; 2567 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2568 CHECK_LINE; 2569 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2570 unsigned val; 2571 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2572 goto no_val; 2573 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2574 goto dup_field; 2575 threadInfo[num_avail][pkgIdIndex] = val; 2576 continue; 2577 } 2578 char s3[] = "core id"; 2579 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2580 CHECK_LINE; 2581 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2582 unsigned val; 2583 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2584 goto no_val; 2585 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2586 goto dup_field; 2587 threadInfo[num_avail][coreIdIndex] = val; 2588 continue; 2589 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2590 } 2591 char s4[] = "thread id"; 2592 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2593 CHECK_LINE; 2594 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2595 unsigned val; 2596 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2597 goto no_val; 2598 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2599 goto dup_field; 2600 threadInfo[num_avail][threadIdIndex] = val; 2601 continue; 2602 } 2603 unsigned level; 2604 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2605 CHECK_LINE; 2606 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2607 unsigned val; 2608 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2609 goto no_val; 2610 // validate the input before using level: 2611 if (level > (unsigned)__kmp_xproc) { // level is too big 2612 level = __kmp_xproc; 2613 } 2614 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2615 goto dup_field; 2616 threadInfo[num_avail][nodeIdIndex + level] = val; 2617 continue; 2618 } 2619 2620 // We didn't recognize the leading token on the line. There are lots of 2621 // leading tokens that we don't recognize - if the line isn't empty, go on 2622 // to the next line. 
2623 if ((*buf != 0) && (*buf != '\n')) { 2624 // If the line is longer than the buffer, read characters 2625 // until we find a newline. 2626 if (long_line) { 2627 int ch; 2628 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2629 ; 2630 } 2631 continue; 2632 } 2633 2634 // A newline has signalled the end of the processor record. 2635 // Check that there aren't too many procs specified. 2636 if ((int)num_avail == __kmp_xproc) { 2637 CLEANUP_THREAD_INFO; 2638 *msg_id = kmp_i18n_str_TooManyEntries; 2639 return false; 2640 } 2641 2642 // Check for missing fields. The osId field must be there, and we 2643 // currently require that the physical id field is specified, also. 2644 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2645 CLEANUP_THREAD_INFO; 2646 *msg_id = kmp_i18n_str_MissingProcField; 2647 return false; 2648 } 2649 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2650 CLEANUP_THREAD_INFO; 2651 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2652 return false; 2653 } 2654 2655 // Skip this proc if it is not included in the machine model. 2656 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2657 __kmp_affin_fullMask)) { 2658 INIT_PROC_INFO(threadInfo[num_avail]); 2659 continue; 2660 } 2661 2662 // We have a successful parse of this proc's info. 2663 // Increment the counter, and prepare for the next proc. 2664 num_avail++; 2665 KMP_ASSERT(num_avail <= num_records); 2666 INIT_PROC_INFO(threadInfo[num_avail]); 2667 } 2668 continue; 2669 2670 no_val: 2671 CLEANUP_THREAD_INFO; 2672 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2673 return false; 2674 2675 dup_field: 2676 CLEANUP_THREAD_INFO; 2677 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2678 return false; 2679 } 2680 *line = 0; 2681 2682 #if KMP_MIC && REDUCE_TEAM_SIZE 2683 unsigned teamSize = 0; 2684 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2685 2686 // check for num_records == __kmp_xproc ??? 2687 2688 // If it is configured to omit the package level when there is only a single 2689 // package, the logic at the end of this routine won't work if there is only a 2690 // single thread 2691 KMP_ASSERT(num_avail > 0); 2692 KMP_ASSERT(num_avail <= num_records); 2693 2694 // Sort the threadInfo table by physical Id. 2695 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2696 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2697 2698 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2699 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2700 // the chips on a system. Although coreId's are usually assigned 2701 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2702 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2703 // 2704 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2705 // total # packages) are at this point - we want to determine that now. We 2706 // only have an upper bound on the first two figures. 2707 unsigned *counts = 2708 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2709 unsigned *maxCt = 2710 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2711 unsigned *totals = 2712 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2713 unsigned *lastId = 2714 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2715 2716 bool assign_thread_ids = false; 2717 unsigned threadIdCt; 2718 unsigned index; 2719 2720 restart_radix_check: 2721 threadIdCt = 0; 2722 2723 // Initialize the counter arrays with data from threadInfo[0]. 
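  // As a sketch of what the counters end up holding (hypothetical input):
  // for eight sorted records forming 2 packages x 2 cores x 2 threads, the
  // scan below finishes with totals[pkgIdIndex] == 2, totals[coreIdIndex] == 4,
  // maxCt[coreIdIndex] == 2 and maxCt[threadIdIndex] == 2, which is where
  // nPackages, __kmp_ncores, nCoresPerPkg and __kmp_nThreadsPerCore are taken
  // from further down.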
2724 if (assign_thread_ids) { 2725 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2726 threadInfo[0][threadIdIndex] = threadIdCt++; 2727 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2728 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2729 } 2730 } 2731 for (index = 0; index <= maxIndex; index++) { 2732 counts[index] = 1; 2733 maxCt[index] = 1; 2734 totals[index] = 1; 2735 lastId[index] = threadInfo[0][index]; 2736 ; 2737 } 2738 2739 // Run through the rest of the OS procs. 2740 for (i = 1; i < num_avail; i++) { 2741 // Find the most significant index whose id differs from the id for the 2742 // previous OS proc. 2743 for (index = maxIndex; index >= threadIdIndex; index--) { 2744 if (assign_thread_ids && (index == threadIdIndex)) { 2745 // Auto-assign the thread id field if it wasn't specified. 2746 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2747 threadInfo[i][threadIdIndex] = threadIdCt++; 2748 } 2749 // Apparently the thread id field was specified for some entries and not 2750 // others. Start the thread id counter off at the next higher thread id. 2751 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2752 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2753 } 2754 } 2755 if (threadInfo[i][index] != lastId[index]) { 2756 // Run through all indices which are less significant, and reset the 2757 // counts to 1. At all levels up to and including index, we need to 2758 // increment the totals and record the last id. 2759 unsigned index2; 2760 for (index2 = threadIdIndex; index2 < index; index2++) { 2761 totals[index2]++; 2762 if (counts[index2] > maxCt[index2]) { 2763 maxCt[index2] = counts[index2]; 2764 } 2765 counts[index2] = 1; 2766 lastId[index2] = threadInfo[i][index2]; 2767 } 2768 counts[index]++; 2769 totals[index]++; 2770 lastId[index] = threadInfo[i][index]; 2771 2772 if (assign_thread_ids && (index > threadIdIndex)) { 2773 2774 #if KMP_MIC && REDUCE_TEAM_SIZE 2775 // The default team size is the total #threads in the machine 2776 // minus 1 thread for every core that has 3 or more threads. 2777 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2778 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2779 2780 // Restart the thread counter, as we are on a new core. 2781 threadIdCt = 0; 2782 2783 // Auto-assign the thread id field if it wasn't specified. 2784 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2785 threadInfo[i][threadIdIndex] = threadIdCt++; 2786 } 2787 2788 // Apparently the thread id field was specified for some entries and 2789 // not others. Start the thread id counter off at the next higher 2790 // thread id. 2791 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2792 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2793 } 2794 } 2795 break; 2796 } 2797 } 2798 if (index < threadIdIndex) { 2799 // If thread ids were specified, it is an error if they are not unique. 2800 // Also, check that we waven't already restarted the loop (to be safe - 2801 // shouldn't need to). 2802 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2803 __kmp_free(lastId); 2804 __kmp_free(totals); 2805 __kmp_free(maxCt); 2806 __kmp_free(counts); 2807 CLEANUP_THREAD_INFO; 2808 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2809 return false; 2810 } 2811 2812 // If the thread ids were not specified and we see entries entries that 2813 // are duplicates, start the loop over and assign the thread ids manually. 
2814 assign_thread_ids = true; 2815 goto restart_radix_check; 2816 } 2817 } 2818 2819 #if KMP_MIC && REDUCE_TEAM_SIZE 2820 // The default team size is the total #threads in the machine 2821 // minus 1 thread for every core that has 3 or more threads. 2822 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2823 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2824 2825 for (index = threadIdIndex; index <= maxIndex; index++) { 2826 if (counts[index] > maxCt[index]) { 2827 maxCt[index] = counts[index]; 2828 } 2829 } 2830 2831 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2832 nCoresPerPkg = maxCt[coreIdIndex]; 2833 nPackages = totals[pkgIdIndex]; 2834 2835 // When affinity is off, this routine will still be called to set 2836 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2837 // Make sure all these vars are set correctly, and return now if affinity is 2838 // not enabled. 2839 __kmp_ncores = totals[coreIdIndex]; 2840 if (!KMP_AFFINITY_CAPABLE()) { 2841 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2842 return true; 2843 } 2844 2845 #if KMP_MIC && REDUCE_TEAM_SIZE 2846 // Set the default team size. 2847 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2848 __kmp_dflt_team_nth = teamSize; 2849 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2850 "__kmp_dflt_team_nth = %d\n", 2851 __kmp_dflt_team_nth)); 2852 } 2853 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2854 2855 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2856 2857 // Count the number of levels which have more nodes at that level than at the 2858 // parent's level (with there being an implicit root node of the top level). 2859 // This is equivalent to saying that there is at least one node at this level 2860 // which has a sibling. These levels are in the map, and the package level is 2861 // always in the map. 2862 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2863 for (index = threadIdIndex; index < maxIndex; index++) { 2864 KMP_ASSERT(totals[index] >= totals[index + 1]); 2865 inMap[index] = (totals[index] > totals[index + 1]); 2866 } 2867 inMap[maxIndex] = (totals[maxIndex] > 1); 2868 inMap[pkgIdIndex] = true; 2869 inMap[coreIdIndex] = true; 2870 inMap[threadIdIndex] = true; 2871 2872 int depth = 0; 2873 int idx = 0; 2874 kmp_hw_t types[KMP_HW_LAST]; 2875 int pkgLevel = -1; 2876 int coreLevel = -1; 2877 int threadLevel = -1; 2878 for (index = threadIdIndex; index <= maxIndex; index++) { 2879 if (inMap[index]) { 2880 depth++; 2881 } 2882 } 2883 if (inMap[pkgIdIndex]) { 2884 pkgLevel = idx; 2885 types[idx++] = KMP_HW_SOCKET; 2886 } 2887 if (inMap[coreIdIndex]) { 2888 coreLevel = idx; 2889 types[idx++] = KMP_HW_CORE; 2890 } 2891 if (inMap[threadIdIndex]) { 2892 threadLevel = idx; 2893 types[idx++] = KMP_HW_THREAD; 2894 } 2895 KMP_ASSERT(depth > 0); 2896 2897 // Construct the data structure that is to be returned. 
2898 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2899 2900 for (i = 0; i < num_avail; ++i) { 2901 unsigned os = threadInfo[i][osIdIndex]; 2902 int src_index; 2903 int dst_index = 0; 2904 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2905 hw_thread.clear(); 2906 hw_thread.os_id = os; 2907 2908 idx = 0; 2909 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2910 if (!inMap[src_index]) { 2911 continue; 2912 } 2913 if (src_index == pkgIdIndex) { 2914 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2915 } else if (src_index == coreIdIndex) { 2916 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2917 } else if (src_index == threadIdIndex) { 2918 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2919 } 2920 dst_index++; 2921 } 2922 } 2923 2924 __kmp_free(inMap); 2925 __kmp_free(lastId); 2926 __kmp_free(totals); 2927 __kmp_free(maxCt); 2928 __kmp_free(counts); 2929 CLEANUP_THREAD_INFO; 2930 __kmp_topology->sort_ids(); 2931 if (!__kmp_topology->check_ids()) { 2932 kmp_topology_t::deallocate(__kmp_topology); 2933 __kmp_topology = nullptr; 2934 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2935 return false; 2936 } 2937 return true; 2938 } 2939 2940 // Create and return a table of affinity masks, indexed by OS thread ID. 2941 // This routine handles OR'ing together all the affinity masks of threads 2942 // that are sufficiently close, if granularity > fine. 2943 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2944 unsigned *numUnique) { 2945 // First form a table of affinity masks in order of OS thread id. 2946 int maxOsId; 2947 int i; 2948 int numAddrs = __kmp_topology->get_num_hw_threads(); 2949 int depth = __kmp_topology->get_depth(); 2950 KMP_ASSERT(numAddrs); 2951 KMP_ASSERT(depth); 2952 2953 maxOsId = 0; 2954 for (i = numAddrs - 1;; --i) { 2955 int osId = __kmp_topology->at(i).os_id; 2956 if (osId > maxOsId) { 2957 maxOsId = osId; 2958 } 2959 if (i == 0) 2960 break; 2961 } 2962 kmp_affin_mask_t *osId2Mask; 2963 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2964 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2965 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2966 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2967 } 2968 if (__kmp_affinity_gran_levels >= (int)depth) { 2969 if (__kmp_affinity_verbose || 2970 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2971 KMP_WARNING(AffThreadsMayMigrate); 2972 } 2973 } 2974 2975 // Run through the table, forming the masks for all threads on each core. 2976 // Threads on the same core will have identical kmp_hw_thread_t objects, not 2977 // considering the last level, which must be the thread id. All threads on a 2978 // core will appear consecutively. 2979 int unique = 0; 2980 int j = 0; // index of 1st thread on core 2981 int leader = 0; 2982 kmp_affin_mask_t *sum; 2983 KMP_CPU_ALLOC_ON_STACK(sum); 2984 KMP_CPU_ZERO(sum); 2985 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 2986 for (i = 1; i < numAddrs; i++) { 2987 // If this thread is sufficiently close to the leader (within the 2988 // granularity setting), then set the bit for this os thread in the 2989 // affinity mask for this group, and go on to the next thread. 2990 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 2991 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2992 continue; 2993 } 2994 2995 // For every thread in this group, copy the mask to the thread's entry in 2996 // the osId2Mask table. Mark the first address as a leader. 
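    // For instance (hypothetical case: granularity of "core" with two
    // threads per core): os ids 4 and 5 fall in the same group, so both of
    // their osId2Mask entries receive the same two-bit mask {4,5} and os id 4
    // is marked as the group's leader.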
2997 for (; j < i; j++) { 2998 int osId = __kmp_topology->at(j).os_id; 2999 KMP_DEBUG_ASSERT(osId <= maxOsId); 3000 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 3001 KMP_CPU_COPY(mask, sum); 3002 __kmp_topology->at(j).leader = (j == leader); 3003 } 3004 unique++; 3005 3006 // Start a new mask. 3007 leader = i; 3008 KMP_CPU_ZERO(sum); 3009 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 3010 } 3011 3012 // For every thread in last group, copy the mask to the thread's 3013 // entry in the osId2Mask table. 3014 for (; j < i; j++) { 3015 int osId = __kmp_topology->at(j).os_id; 3016 KMP_DEBUG_ASSERT(osId <= maxOsId); 3017 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 3018 KMP_CPU_COPY(mask, sum); 3019 __kmp_topology->at(j).leader = (j == leader); 3020 } 3021 unique++; 3022 KMP_CPU_FREE_FROM_STACK(sum); 3023 3024 *maxIndex = maxOsId; 3025 *numUnique = unique; 3026 return osId2Mask; 3027 } 3028 3029 // Stuff for the affinity proclist parsers. It's easier to declare these vars 3030 // as file-static than to try and pass them through the calling sequence of 3031 // the recursive-descent OMP_PLACES parser. 3032 static kmp_affin_mask_t *newMasks; 3033 static int numNewMasks; 3034 static int nextNewMask; 3035 3036 #define ADD_MASK(_mask) \ 3037 { \ 3038 if (nextNewMask >= numNewMasks) { \ 3039 int i; \ 3040 numNewMasks *= 2; \ 3041 kmp_affin_mask_t *temp; \ 3042 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 3043 for (i = 0; i < numNewMasks / 2; i++) { \ 3044 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 3045 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 3046 KMP_CPU_COPY(dest, src); \ 3047 } \ 3048 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 3049 newMasks = temp; \ 3050 } \ 3051 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 3052 nextNewMask++; \ 3053 } 3054 3055 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 3056 { \ 3057 if (((_osId) > _maxOsId) || \ 3058 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 3059 if (__kmp_affinity_verbose || \ 3060 (__kmp_affinity_warnings && \ 3061 (__kmp_affinity_type != affinity_none))) { \ 3062 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 3063 } \ 3064 } else { \ 3065 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 3066 } \ 3067 } 3068 3069 // Re-parse the proclist (for the explicit affinity type), and form the list 3070 // of affinity newMasks indexed by gtid. 3071 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 3072 unsigned int *out_numMasks, 3073 const char *proclist, 3074 kmp_affin_mask_t *osId2Mask, 3075 int maxOsId) { 3076 int i; 3077 const char *scan = proclist; 3078 const char *next = proclist; 3079 3080 // We use malloc() for the temporary mask vector, so that we can use 3081 // realloc() to extend it. 3082 numNewMasks = 2; 3083 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3084 nextNewMask = 0; 3085 kmp_affin_mask_t *sumMask; 3086 KMP_CPU_ALLOC(sumMask); 3087 int setSize = 0; 3088 3089 for (;;) { 3090 int start, end, stride; 3091 3092 SKIP_WS(scan); 3093 next = scan; 3094 if (*next == '\0') { 3095 break; 3096 } 3097 3098 if (*next == '{') { 3099 int num; 3100 setSize = 0; 3101 next++; // skip '{' 3102 SKIP_WS(next); 3103 scan = next; 3104 3105 // Read the first integer in the set. 
3106 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 3107 SKIP_DIGITS(next); 3108 num = __kmp_str_to_int(scan, *next); 3109 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3110 3111 // Copy the mask for that osId to the sum (union) mask. 3112 if ((num > maxOsId) || 3113 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3114 if (__kmp_affinity_verbose || 3115 (__kmp_affinity_warnings && 3116 (__kmp_affinity_type != affinity_none))) { 3117 KMP_WARNING(AffIgnoreInvalidProcID, num); 3118 } 3119 KMP_CPU_ZERO(sumMask); 3120 } else { 3121 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3122 setSize = 1; 3123 } 3124 3125 for (;;) { 3126 // Check for end of set. 3127 SKIP_WS(next); 3128 if (*next == '}') { 3129 next++; // skip '}' 3130 break; 3131 } 3132 3133 // Skip optional comma. 3134 if (*next == ',') { 3135 next++; 3136 } 3137 SKIP_WS(next); 3138 3139 // Read the next integer in the set. 3140 scan = next; 3141 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3142 3143 SKIP_DIGITS(next); 3144 num = __kmp_str_to_int(scan, *next); 3145 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3146 3147 // Add the mask for that osId to the sum mask. 3148 if ((num > maxOsId) || 3149 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3150 if (__kmp_affinity_verbose || 3151 (__kmp_affinity_warnings && 3152 (__kmp_affinity_type != affinity_none))) { 3153 KMP_WARNING(AffIgnoreInvalidProcID, num); 3154 } 3155 } else { 3156 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3157 setSize++; 3158 } 3159 } 3160 if (setSize > 0) { 3161 ADD_MASK(sumMask); 3162 } 3163 3164 SKIP_WS(next); 3165 if (*next == ',') { 3166 next++; 3167 } 3168 scan = next; 3169 continue; 3170 } 3171 3172 // Read the first integer. 3173 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3174 SKIP_DIGITS(next); 3175 start = __kmp_str_to_int(scan, *next); 3176 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3177 SKIP_WS(next); 3178 3179 // If this isn't a range, then add a mask to the list and go on. 3180 if (*next != '-') { 3181 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3182 3183 // Skip optional comma. 3184 if (*next == ',') { 3185 next++; 3186 } 3187 scan = next; 3188 continue; 3189 } 3190 3191 // This is a range. Skip over the '-' and read in the 2nd int. 3192 next++; // skip '-' 3193 SKIP_WS(next); 3194 scan = next; 3195 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3196 SKIP_DIGITS(next); 3197 end = __kmp_str_to_int(scan, *next); 3198 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3199 3200 // Check for a stride parameter 3201 stride = 1; 3202 SKIP_WS(next); 3203 if (*next == ':') { 3204 // A stride is specified. Skip over the ':" and read the 3rd int. 3205 int sign = +1; 3206 next++; // skip ':' 3207 SKIP_WS(next); 3208 scan = next; 3209 if (*next == '-') { 3210 sign = -1; 3211 next++; 3212 SKIP_WS(next); 3213 scan = next; 3214 } 3215 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3216 SKIP_DIGITS(next); 3217 stride = __kmp_str_to_int(scan, *next); 3218 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 3219 stride *= sign; 3220 } 3221 3222 // Do some range checks. 
3223 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3224 if (stride > 0) { 3225 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3226 } else { 3227 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3228 } 3229 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3230 3231 // Add the mask for each OS proc # to the list. 3232 if (stride > 0) { 3233 do { 3234 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3235 start += stride; 3236 } while (start <= end); 3237 } else { 3238 do { 3239 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3240 start += stride; 3241 } while (start >= end); 3242 } 3243 3244 // Skip optional comma. 3245 SKIP_WS(next); 3246 if (*next == ',') { 3247 next++; 3248 } 3249 scan = next; 3250 } 3251 3252 *out_numMasks = nextNewMask; 3253 if (nextNewMask == 0) { 3254 *out_masks = NULL; 3255 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3256 return; 3257 } 3258 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3259 for (i = 0; i < nextNewMask; i++) { 3260 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3261 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3262 KMP_CPU_COPY(dest, src); 3263 } 3264 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3265 KMP_CPU_FREE(sumMask); 3266 } 3267 3268 /*----------------------------------------------------------------------------- 3269 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3270 places. Again, Here is the grammar: 3271 3272 place_list := place 3273 place_list := place , place_list 3274 place := num 3275 place := place : num 3276 place := place : num : signed 3277 place := { subplacelist } 3278 place := ! place // (lowest priority) 3279 subplace_list := subplace 3280 subplace_list := subplace , subplace_list 3281 subplace := num 3282 subplace := num : num 3283 subplace := num : num : signed 3284 signed := num 3285 signed := + signed 3286 signed := - signed 3287 -----------------------------------------------------------------------------*/ 3288 static void __kmp_process_subplace_list(const char **scan, 3289 kmp_affin_mask_t *osId2Mask, 3290 int maxOsId, kmp_affin_mask_t *tempMask, 3291 int *setSize) { 3292 const char *next; 3293 3294 for (;;) { 3295 int start, count, stride, i; 3296 3297 // Read in the starting proc id 3298 SKIP_WS(*scan); 3299 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3300 next = *scan; 3301 SKIP_DIGITS(next); 3302 start = __kmp_str_to_int(*scan, *next); 3303 KMP_ASSERT(start >= 0); 3304 *scan = next; 3305 3306 // valid follow sets are ',' ':' and '}' 3307 SKIP_WS(*scan); 3308 if (**scan == '}' || **scan == ',') { 3309 if ((start > maxOsId) || 3310 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3311 if (__kmp_affinity_verbose || 3312 (__kmp_affinity_warnings && 3313 (__kmp_affinity_type != affinity_none))) { 3314 KMP_WARNING(AffIgnoreInvalidProcID, start); 3315 } 3316 } else { 3317 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3318 (*setSize)++; 3319 } 3320 if (**scan == '}') { 3321 break; 3322 } 3323 (*scan)++; // skip ',' 3324 continue; 3325 } 3326 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3327 (*scan)++; // skip ':' 3328 3329 // Read count parameter 3330 SKIP_WS(*scan); 3331 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3332 next = *scan; 3333 SKIP_DIGITS(next); 3334 count = __kmp_str_to_int(*scan, *next); 3335 KMP_ASSERT(count >= 0); 3336 *scan = next; 3337 3338 // valid follow sets are ',' ':' and '}' 3339 SKIP_WS(*scan); 3340 if (**scan == 
'}' || **scan == ',') { 3341 for (i = 0; i < count; i++) { 3342 if ((start > maxOsId) || 3343 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3344 if (__kmp_affinity_verbose || 3345 (__kmp_affinity_warnings && 3346 (__kmp_affinity_type != affinity_none))) { 3347 KMP_WARNING(AffIgnoreInvalidProcID, start); 3348 } 3349 break; // don't proliferate warnings for large count 3350 } else { 3351 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3352 start++; 3353 (*setSize)++; 3354 } 3355 } 3356 if (**scan == '}') { 3357 break; 3358 } 3359 (*scan)++; // skip ',' 3360 continue; 3361 } 3362 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3363 (*scan)++; // skip ':' 3364 3365 // Read stride parameter 3366 int sign = +1; 3367 for (;;) { 3368 SKIP_WS(*scan); 3369 if (**scan == '+') { 3370 (*scan)++; // skip '+' 3371 continue; 3372 } 3373 if (**scan == '-') { 3374 sign *= -1; 3375 (*scan)++; // skip '-' 3376 continue; 3377 } 3378 break; 3379 } 3380 SKIP_WS(*scan); 3381 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3382 next = *scan; 3383 SKIP_DIGITS(next); 3384 stride = __kmp_str_to_int(*scan, *next); 3385 KMP_ASSERT(stride >= 0); 3386 *scan = next; 3387 stride *= sign; 3388 3389 // valid follow sets are ',' and '}' 3390 SKIP_WS(*scan); 3391 if (**scan == '}' || **scan == ',') { 3392 for (i = 0; i < count; i++) { 3393 if ((start > maxOsId) || 3394 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3395 if (__kmp_affinity_verbose || 3396 (__kmp_affinity_warnings && 3397 (__kmp_affinity_type != affinity_none))) { 3398 KMP_WARNING(AffIgnoreInvalidProcID, start); 3399 } 3400 break; // don't proliferate warnings for large count 3401 } else { 3402 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3403 start += stride; 3404 (*setSize)++; 3405 } 3406 } 3407 if (**scan == '}') { 3408 break; 3409 } 3410 (*scan)++; // skip ',' 3411 continue; 3412 } 3413 3414 KMP_ASSERT2(0, "bad explicit places list"); 3415 } 3416 } 3417 3418 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3419 int maxOsId, kmp_affin_mask_t *tempMask, 3420 int *setSize) { 3421 const char *next; 3422 3423 // valid follow sets are '{' '!' and num 3424 SKIP_WS(*scan); 3425 if (**scan == '{') { 3426 (*scan)++; // skip '{' 3427 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3428 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3429 (*scan)++; // skip '}' 3430 } else if (**scan == '!') { 3431 (*scan)++; // skip '!' 
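// For illustration (hypothetical place list): "!{0,1}" first builds the mask
// {0,1} via the recursive call below, then KMP_CPU_COMPLEMENT() flips it over
// OS proc ids 0..maxOsId, i.e. it selects every known proc except 0 and 1.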
3432 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3433 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3434 } else if ((**scan >= '0') && (**scan <= '9')) { 3435 next = *scan; 3436 SKIP_DIGITS(next); 3437 int num = __kmp_str_to_int(*scan, *next); 3438 KMP_ASSERT(num >= 0); 3439 if ((num > maxOsId) || 3440 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3441 if (__kmp_affinity_verbose || 3442 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3443 KMP_WARNING(AffIgnoreInvalidProcID, num); 3444 } 3445 } else { 3446 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3447 (*setSize)++; 3448 } 3449 *scan = next; // skip num 3450 } else { 3451 KMP_ASSERT2(0, "bad explicit places list"); 3452 } 3453 } 3454 3455 // static void 3456 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3457 unsigned int *out_numMasks, 3458 const char *placelist, 3459 kmp_affin_mask_t *osId2Mask, 3460 int maxOsId) { 3461 int i, j, count, stride, sign; 3462 const char *scan = placelist; 3463 const char *next = placelist; 3464 3465 numNewMasks = 2; 3466 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3467 nextNewMask = 0; 3468 3469 // tempMask is modified based on the previous or initial 3470 // place to form the current place 3471 // previousMask contains the previous place 3472 kmp_affin_mask_t *tempMask; 3473 kmp_affin_mask_t *previousMask; 3474 KMP_CPU_ALLOC(tempMask); 3475 KMP_CPU_ZERO(tempMask); 3476 KMP_CPU_ALLOC(previousMask); 3477 KMP_CPU_ZERO(previousMask); 3478 int setSize = 0; 3479 3480 for (;;) { 3481 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3482 3483 // valid follow sets are ',' ':' and EOL 3484 SKIP_WS(scan); 3485 if (*scan == '\0' || *scan == ',') { 3486 if (setSize > 0) { 3487 ADD_MASK(tempMask); 3488 } 3489 KMP_CPU_ZERO(tempMask); 3490 setSize = 0; 3491 if (*scan == '\0') { 3492 break; 3493 } 3494 scan++; // skip ',' 3495 continue; 3496 } 3497 3498 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3499 scan++; // skip ':' 3500 3501 // Read count parameter 3502 SKIP_WS(scan); 3503 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3504 next = scan; 3505 SKIP_DIGITS(next); 3506 count = __kmp_str_to_int(scan, *next); 3507 KMP_ASSERT(count >= 0); 3508 scan = next; 3509 3510 // valid follow sets are ',' ':' and EOL 3511 SKIP_WS(scan); 3512 if (*scan == '\0' || *scan == ',') { 3513 stride = +1; 3514 } else { 3515 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3516 scan++; // skip ':' 3517 3518 // Read stride parameter 3519 sign = +1; 3520 for (;;) { 3521 SKIP_WS(scan); 3522 if (*scan == '+') { 3523 scan++; // skip '+' 3524 continue; 3525 } 3526 if (*scan == '-') { 3527 sign *= -1; 3528 scan++; // skip '-' 3529 continue; 3530 } 3531 break; 3532 } 3533 SKIP_WS(scan); 3534 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3535 next = scan; 3536 SKIP_DIGITS(next); 3537 stride = __kmp_str_to_int(scan, *next); 3538 KMP_DEBUG_ASSERT(stride >= 0); 3539 scan = next; 3540 stride *= sign; 3541 } 3542 3543 // Add places determined by initial_place : count : stride 3544 for (i = 0; i < count; i++) { 3545 if (setSize == 0) { 3546 break; 3547 } 3548 // Add the current place, then build the next place (tempMask) from that 3549 KMP_CPU_COPY(previousMask, tempMask); 3550 ADD_MASK(previousMask); 3551 KMP_CPU_ZERO(tempMask); 3552 setSize = 0; 3553 KMP_CPU_SET_ITERATE(j, previousMask) { 3554 if (!KMP_CPU_ISSET(j, previousMask)) { 3555 continue; 3556 } 3557 if ((j + stride > 
maxOsId) || (j + stride < 0) || 3558 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3559 (!KMP_CPU_ISSET(j + stride, 3560 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3561 if ((__kmp_affinity_verbose || 3562 (__kmp_affinity_warnings && 3563 (__kmp_affinity_type != affinity_none))) && 3564 i < count - 1) { 3565 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3566 } 3567 continue; 3568 } 3569 KMP_CPU_SET(j + stride, tempMask); 3570 setSize++; 3571 } 3572 } 3573 KMP_CPU_ZERO(tempMask); 3574 setSize = 0; 3575 3576 // valid follow sets are ',' and EOL 3577 SKIP_WS(scan); 3578 if (*scan == '\0') { 3579 break; 3580 } 3581 if (*scan == ',') { 3582 scan++; // skip ',' 3583 continue; 3584 } 3585 3586 KMP_ASSERT2(0, "bad explicit places list"); 3587 } 3588 3589 *out_numMasks = nextNewMask; 3590 if (nextNewMask == 0) { 3591 *out_masks = NULL; 3592 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3593 return; 3594 } 3595 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3596 KMP_CPU_FREE(tempMask); 3597 KMP_CPU_FREE(previousMask); 3598 for (i = 0; i < nextNewMask; i++) { 3599 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3600 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3601 KMP_CPU_COPY(dest, src); 3602 } 3603 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3604 } 3605 3606 #undef ADD_MASK 3607 #undef ADD_MASK_OSID 3608 3609 // This function figures out the deepest level at which there is at least one 3610 // cluster/core with more than one processing unit bound to it. 3611 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3612 int core_level = 0; 3613 3614 for (int i = 0; i < nprocs; i++) { 3615 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3616 for (int j = bottom_level; j > 0; j--) { 3617 if (hw_thread.ids[j] > 0) { 3618 if (core_level < (j - 1)) { 3619 core_level = j - 1; 3620 } 3621 } 3622 } 3623 } 3624 return core_level; 3625 } 3626 3627 // This function counts number of clusters/cores at given level. 3628 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3629 int core_level) { 3630 return __kmp_topology->get_count(core_level); 3631 } 3632 // This function finds to which cluster/core given processing unit is bound. 3633 static int __kmp_affinity_find_core(int proc, int bottom_level, 3634 int core_level) { 3635 int core = 0; 3636 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3637 for (int i = 0; i <= proc; ++i) { 3638 if (i + 1 <= proc) { 3639 for (int j = 0; j <= core_level; ++j) { 3640 if (__kmp_topology->at(i + 1).sub_ids[j] != 3641 __kmp_topology->at(i).sub_ids[j]) { 3642 core++; 3643 break; 3644 } 3645 } 3646 } 3647 } 3648 return core; 3649 } 3650 3651 // This function finds maximal number of processing units bound to a 3652 // cluster/core at given level. 
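// Worked example (hypothetical machine shape): on 2 sockets x 4 cores x 2
// hardware threads, with core_level at the core layer, the routine below
// returns the thread:core ratio of 2; whenever core_level >= bottom_level it
// simply returns 1.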
3653 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, 3654 int core_level) { 3655 if (core_level >= bottom_level) 3656 return 1; 3657 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); 3658 return __kmp_topology->calculate_ratio(thread_level, core_level); 3659 } 3660 3661 static int *procarr = NULL; 3662 static int __kmp_aff_depth = 0; 3663 3664 // Create a one element mask array (set of places) which only contains the 3665 // initial process's affinity mask 3666 static void __kmp_create_affinity_none_places() { 3667 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3668 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3669 __kmp_affinity_num_masks = 1; 3670 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3671 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 3672 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 3673 } 3674 3675 static void __kmp_aux_affinity_initialize(void) { 3676 if (__kmp_affinity_masks != NULL) { 3677 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3678 return; 3679 } 3680 3681 // Create the "full" mask - this defines all of the processors that we 3682 // consider to be in the machine model. If respect is set, then it is the 3683 // initialization thread's affinity mask. Otherwise, it is all processors that 3684 // we know about on the machine. 3685 if (__kmp_affin_fullMask == NULL) { 3686 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3687 } 3688 if (KMP_AFFINITY_CAPABLE()) { 3689 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3690 if (__kmp_affinity_respect_mask) { 3691 // Count the number of available processors. 3692 unsigned i; 3693 __kmp_avail_proc = 0; 3694 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3695 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3696 continue; 3697 } 3698 __kmp_avail_proc++; 3699 } 3700 if (__kmp_avail_proc > __kmp_xproc) { 3701 if (__kmp_affinity_verbose || 3702 (__kmp_affinity_warnings && 3703 (__kmp_affinity_type != affinity_none))) { 3704 KMP_WARNING(ErrorInitializeAffinity); 3705 } 3706 __kmp_affinity_type = affinity_none; 3707 KMP_AFFINITY_DISABLE(); 3708 return; 3709 } 3710 3711 if (__kmp_affinity_verbose) { 3712 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3713 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3714 __kmp_affin_fullMask); 3715 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 3716 } 3717 } else { 3718 if (__kmp_affinity_verbose) { 3719 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3720 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3721 __kmp_affin_fullMask); 3722 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 3723 } 3724 __kmp_avail_proc = 3725 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3726 #if KMP_OS_WINDOWS 3727 // Set the process affinity mask since threads' affinity 3728 // masks must be subset of process mask in Windows* OS 3729 __kmp_affin_fullMask->set_process_affinity(true); 3730 #endif 3731 } 3732 } 3733 3734 kmp_i18n_id_t msg_id = kmp_i18n_null; 3735 3736 // For backward compatibility, setting KMP_CPUINFO_FILE => 3737 // KMP_TOPOLOGY_METHOD=cpuinfo 3738 if ((__kmp_cpuinfo_file != NULL) && 3739 (__kmp_affinity_top_method == affinity_top_method_all)) { 3740 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3741 } 3742 3743 bool success = false; 3744 if (__kmp_affinity_top_method == affinity_top_method_all) { 3745 // In the default code path, errors are not fatal - we just try using 3746 // another method. 
We only emit a warning message if affinity is on, or the 3747 // verbose flag is set, and the nowarnings flag was not set. 3748 #if KMP_USE_HWLOC 3749 if (!success && 3750 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3751 if (!__kmp_hwloc_error) { 3752 success = __kmp_affinity_create_hwloc_map(&msg_id); 3753 if (!success && __kmp_affinity_verbose) { 3754 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3755 } 3756 } else if (__kmp_affinity_verbose) { 3757 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3758 } 3759 } 3760 #endif 3761 3762 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3763 if (!success) { 3764 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3765 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3766 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3767 } 3768 } 3769 if (!success) { 3770 success = __kmp_affinity_create_apicid_map(&msg_id); 3771 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3772 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3773 } 3774 } 3775 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3776 3777 #if KMP_OS_LINUX 3778 if (!success) { 3779 int line = 0; 3780 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3781 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3782 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3783 } 3784 } 3785 #endif /* KMP_OS_LINUX */ 3786 3787 #if KMP_GROUP_AFFINITY 3788 if (!success && (__kmp_num_proc_groups > 1)) { 3789 success = __kmp_affinity_create_proc_group_map(&msg_id); 3790 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3791 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3792 } 3793 } 3794 #endif /* KMP_GROUP_AFFINITY */ 3795 3796 if (!success) { 3797 success = __kmp_affinity_create_flat_map(&msg_id); 3798 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3799 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3800 } 3801 KMP_ASSERT(success); 3802 } 3803 } 3804 3805 // If the user has specified that a particular topology discovery method is to be used, then we abort if that method fails. The exception is group affinity, 3806 // which might have been implicitly set. 3807
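// Illustration (assuming the documented KMP_TOPOLOGY_METHOD values): the
// else-if chain below maps an explicit method request to exactly one
// discovery routine and treats its failure as fatal, e.g.
// KMP_TOPOLOGY_METHOD=cpuinfo forces __kmp_affinity_create_cpuinfo_map() and
// reports the offending cpuinfo file name and line number rather than
// falling back to the flat map.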
3808 #if KMP_USE_HWLOC 3809 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3810 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 3811 success = __kmp_affinity_create_hwloc_map(&msg_id); 3812 if (!success) { 3813 KMP_ASSERT(msg_id != kmp_i18n_null); 3814 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3815 } 3816 } 3817 #endif // KMP_USE_HWLOC 3818 3819 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3820 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 3821 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 3822 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3823 if (!success) { 3824 KMP_ASSERT(msg_id != kmp_i18n_null); 3825 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3826 } 3827 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3828 success = __kmp_affinity_create_apicid_map(&msg_id); 3829 if (!success) { 3830 KMP_ASSERT(msg_id != kmp_i18n_null); 3831 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3832 } 3833 } 3834 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3835 3836 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3837 int line = 0; 3838 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3839 if (!success) { 3840 KMP_ASSERT(msg_id != kmp_i18n_null); 3841 const char *filename = __kmp_cpuinfo_get_filename(); 3842 if (line > 0) { 3843 KMP_FATAL(FileLineMsgExiting, filename, line, 3844 __kmp_i18n_catgets(msg_id)); 3845 } else { 3846 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3847 } 3848 } 3849 } 3850 3851 #if KMP_GROUP_AFFINITY 3852 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3853 success = __kmp_affinity_create_proc_group_map(&msg_id); 3854 KMP_ASSERT(success); 3855 if (!success) { 3856 KMP_ASSERT(msg_id != kmp_i18n_null); 3857 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3858 } 3859 } 3860 #endif /* KMP_GROUP_AFFINITY */ 3861 3862 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3863 success = __kmp_affinity_create_flat_map(&msg_id); 3864 // should not fail 3865 KMP_ASSERT(success); 3866 } 3867 3868 // Early exit if topology could not be created 3869 if (!__kmp_topology) { 3870 if (KMP_AFFINITY_CAPABLE() && 3871 (__kmp_affinity_verbose || 3872 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 3873 KMP_WARNING(ErrorInitializeAffinity); 3874 } 3875 if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && 3876 __kmp_ncores > 0) { 3877 __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); 3878 __kmp_topology->canonicalize(nPackages, nCoresPerPkg, 3879 __kmp_nThreadsPerCore, __kmp_ncores); 3880 if (__kmp_affinity_verbose) { 3881 __kmp_topology->print("KMP_AFFINITY"); 3882 } 3883 } 3884 __kmp_affinity_type = affinity_none; 3885 __kmp_create_affinity_none_places(); 3886 #if KMP_USE_HIER_SCHED 3887 __kmp_dispatch_set_hierarchy_values(); 3888 #endif 3889 KMP_AFFINITY_DISABLE(); 3890 return; 3891 } 3892 3893 // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and 3894 // initialize other data structures which depend on the topology 3895 __kmp_topology->canonicalize(); 3896 if (__kmp_affinity_verbose) 3897 __kmp_topology->print("KMP_AFFINITY"); 3898 bool filtered = __kmp_topology->filter_hw_subset(); 3899 if (filtered && __kmp_affinity_verbose) 3900 __kmp_topology->print("KMP_HW_SUBSET"); 3901 machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); 3902 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); 3903 // If 
KMP_AFFINITY=none, then only create the single "none" place 3904 // which is the process's initial affinity mask or the number of 3905 // hardware threads depending on respect,norespect 3906 if (__kmp_affinity_type == affinity_none) { 3907 __kmp_create_affinity_none_places(); 3908 #if KMP_USE_HIER_SCHED 3909 __kmp_dispatch_set_hierarchy_values(); 3910 #endif 3911 return; 3912 } 3913 int depth = __kmp_topology->get_depth(); 3914 3915 // Create the table of masks, indexed by thread Id. 3916 unsigned maxIndex; 3917 unsigned numUnique; 3918 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); 3919 if (__kmp_affinity_gran_levels == 0) { 3920 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3921 } 3922 3923 switch (__kmp_affinity_type) { 3924 3925 case affinity_explicit: 3926 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3927 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 3928 __kmp_affinity_process_proclist( 3929 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3930 __kmp_affinity_proclist, osId2Mask, maxIndex); 3931 } else { 3932 __kmp_affinity_process_placelist( 3933 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3934 __kmp_affinity_proclist, osId2Mask, maxIndex); 3935 } 3936 if (__kmp_affinity_num_masks == 0) { 3937 if (__kmp_affinity_verbose || 3938 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3939 KMP_WARNING(AffNoValidProcID); 3940 } 3941 __kmp_affinity_type = affinity_none; 3942 __kmp_create_affinity_none_places(); 3943 return; 3944 } 3945 break; 3946 3947 // The other affinity types rely on sorting the hardware threads according to 3948 // some permutation of the machine topology tree. Set __kmp_affinity_compact 3949 // and __kmp_affinity_offset appropriately, then jump to a common code 3950 // fragment to do the sort and create the array of affinity masks. 
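// Illustration (hypothetical depth-3 topology: socket, core, thread):
// __kmp_affinity_compact selects how many of the innermost levels become the
// primary sort keys in compare_compact(). KMP_AFFINITY=scatter turns a user
// level of 0 into depth - 1 - 0 = 2, so the sorted table varies the socket id
// fastest and consecutive places land on different sockets, while
// KMP_AFFINITY=compact (level 0) keeps neighboring hardware threads adjacent.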
3951 case affinity_logical: 3952 __kmp_affinity_compact = 0; 3953 if (__kmp_affinity_offset) { 3954 __kmp_affinity_offset = 3955 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 3956 } 3957 goto sortTopology; 3958 3959 case affinity_physical: 3960 if (__kmp_nThreadsPerCore > 1) { 3961 __kmp_affinity_compact = 1; 3962 if (__kmp_affinity_compact >= depth) { 3963 __kmp_affinity_compact = 0; 3964 } 3965 } else { 3966 __kmp_affinity_compact = 0; 3967 } 3968 if (__kmp_affinity_offset) { 3969 __kmp_affinity_offset = 3970 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 3971 } 3972 goto sortTopology; 3973 3974 case affinity_scatter: 3975 if (__kmp_affinity_compact >= depth) { 3976 __kmp_affinity_compact = 0; 3977 } else { 3978 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3979 } 3980 goto sortTopology; 3981 3982 case affinity_compact: 3983 if (__kmp_affinity_compact >= depth) { 3984 __kmp_affinity_compact = depth - 1; 3985 } 3986 goto sortTopology; 3987 3988 case affinity_balanced: 3989 if (depth <= 1) { 3990 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 3991 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 3992 } 3993 __kmp_affinity_type = affinity_none; 3994 __kmp_create_affinity_none_places(); 3995 return; 3996 } else if (!__kmp_topology->is_uniform()) { 3997 // Save the depth for further usage 3998 __kmp_aff_depth = depth; 3999 4000 int core_level = 4001 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); 4002 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, 4003 core_level); 4004 int maxprocpercore = __kmp_affinity_max_proc_per_core( 4005 __kmp_avail_proc, depth - 1, core_level); 4006 4007 int nproc = ncores * maxprocpercore; 4008 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 4009 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 4010 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 4011 } 4012 __kmp_affinity_type = affinity_none; 4013 return; 4014 } 4015 4016 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4017 for (int i = 0; i < nproc; i++) { 4018 procarr[i] = -1; 4019 } 4020 4021 int lastcore = -1; 4022 int inlastcore = 0; 4023 for (int i = 0; i < __kmp_avail_proc; i++) { 4024 int proc = __kmp_topology->at(i).os_id; 4025 int core = __kmp_affinity_find_core(i, depth - 1, core_level); 4026 4027 if (core == lastcore) { 4028 inlastcore++; 4029 } else { 4030 inlastcore = 0; 4031 } 4032 lastcore = core; 4033 4034 procarr[core * maxprocpercore + inlastcore] = proc; 4035 } 4036 } 4037 if (__kmp_affinity_compact >= depth) { 4038 __kmp_affinity_compact = depth - 1; 4039 } 4040 4041 sortTopology: 4042 // Allocate the gtid->affinity mask table. 4043 if (__kmp_affinity_dups) { 4044 __kmp_affinity_num_masks = __kmp_avail_proc; 4045 } else { 4046 __kmp_affinity_num_masks = numUnique; 4047 } 4048 4049 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 4050 (__kmp_affinity_num_places > 0) && 4051 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 4052 __kmp_affinity_num_masks = __kmp_affinity_num_places; 4053 } 4054 4055 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4056 4057 // Sort the topology table according to the current setting of 4058 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
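// Worked example (hypothetical 4-core/2-SMT machine, granularity=core):
// __kmp_create_masks() produced numUnique == 4 core-wide masks, so unless
// duplicates were requested (__kmp_affinity_dups) the loop below copies one
// mask per topology "leader" thread into __kmp_affinity_masks and skips each
// leader's SMT sibling.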
4059 __kmp_topology->sort_compact(); 4060 { 4061 int i; 4062 unsigned j; 4063 int num_hw_threads = __kmp_topology->get_num_hw_threads(); 4064 for (i = 0, j = 0; i < num_hw_threads; i++) { 4065 if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { 4066 continue; 4067 } 4068 int osId = __kmp_topology->at(i).os_id; 4069 4070 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 4071 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 4072 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 4073 KMP_CPU_COPY(dest, src); 4074 if (++j >= __kmp_affinity_num_masks) { 4075 break; 4076 } 4077 } 4078 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 4079 } 4080 // Sort the topology back using ids 4081 __kmp_topology->sort_ids(); 4082 break; 4083 4084 default: 4085 KMP_ASSERT2(0, "Unexpected affinity setting"); 4086 } 4087 4088 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 4089 } 4090 4091 void __kmp_affinity_initialize(void) { 4092 // Much of the code above was written assuming that if a machine was not 4093 // affinity capable, then __kmp_affinity_type == affinity_none. We now 4094 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 4095 // There are too many checks for __kmp_affinity_type == affinity_none 4096 // in this code. Instead of trying to change them all, check if 4097 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 4098 // affinity_none, call the real initialization routine, then restore 4099 // __kmp_affinity_type to affinity_disabled. 4100 int disabled = (__kmp_affinity_type == affinity_disabled); 4101 if (!KMP_AFFINITY_CAPABLE()) { 4102 KMP_ASSERT(disabled); 4103 } 4104 if (disabled) { 4105 __kmp_affinity_type = affinity_none; 4106 } 4107 __kmp_aux_affinity_initialize(); 4108 if (disabled) { 4109 __kmp_affinity_type = affinity_disabled; 4110 } 4111 } 4112 4113 void __kmp_affinity_uninitialize(void) { 4114 if (__kmp_affinity_masks != NULL) { 4115 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 4116 __kmp_affinity_masks = NULL; 4117 } 4118 if (__kmp_affin_fullMask != NULL) { 4119 KMP_CPU_FREE(__kmp_affin_fullMask); 4120 __kmp_affin_fullMask = NULL; 4121 } 4122 __kmp_affinity_num_masks = 0; 4123 __kmp_affinity_type = affinity_default; 4124 __kmp_affinity_num_places = 0; 4125 if (__kmp_affinity_proclist != NULL) { 4126 __kmp_free(__kmp_affinity_proclist); 4127 __kmp_affinity_proclist = NULL; 4128 } 4129 if (procarr != NULL) { 4130 __kmp_free(procarr); 4131 procarr = NULL; 4132 } 4133 #if KMP_USE_HWLOC 4134 if (__kmp_hwloc_topology != NULL) { 4135 hwloc_topology_destroy(__kmp_hwloc_topology); 4136 __kmp_hwloc_topology = NULL; 4137 } 4138 #endif 4139 if (__kmp_hw_subset) { 4140 kmp_hw_subset_t::deallocate(__kmp_hw_subset); 4141 __kmp_hw_subset = nullptr; 4142 } 4143 if (__kmp_topology) { 4144 kmp_topology_t::deallocate(__kmp_topology); 4145 __kmp_topology = nullptr; 4146 } 4147 KMPAffinity::destroy_api(); 4148 } 4149 4150 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4151 if (!KMP_AFFINITY_CAPABLE()) { 4152 return; 4153 } 4154 4155 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4156 if (th->th.th_affin_mask == NULL) { 4157 KMP_CPU_ALLOC(th->th.th_affin_mask); 4158 } else { 4159 KMP_CPU_ZERO(th->th.th_affin_mask); 4160 } 4161 4162 // Copy the thread mask to the kmp_info_t structure. If 4163 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. 
one that 4164 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4165 // then the full mask is the same as the mask of the initialization thread. 4166 kmp_affin_mask_t *mask; 4167 int i; 4168 4169 if (KMP_AFFINITY_NON_PROC_BIND) { 4170 if ((__kmp_affinity_type == affinity_none) || 4171 (__kmp_affinity_type == affinity_balanced) || 4172 KMP_HIDDEN_HELPER_THREAD(gtid)) { 4173 #if KMP_GROUP_AFFINITY 4174 if (__kmp_num_proc_groups > 1) { 4175 return; 4176 } 4177 #endif 4178 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4179 i = 0; 4180 mask = __kmp_affin_fullMask; 4181 } else { 4182 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4183 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4184 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4185 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4186 } 4187 } else { 4188 if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) || 4189 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4190 #if KMP_GROUP_AFFINITY 4191 if (__kmp_num_proc_groups > 1) { 4192 return; 4193 } 4194 #endif 4195 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4196 i = KMP_PLACE_ALL; 4197 mask = __kmp_affin_fullMask; 4198 } else { 4199 // int i = some hash function or just a counter that doesn't 4200 // always start at 0. Use adjusted gtid for now. 4201 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4202 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4203 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4204 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4205 } 4206 } 4207 4208 th->th.th_current_place = i; 4209 if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) { 4210 th->th.th_new_place = i; 4211 th->th.th_first_place = 0; 4212 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4213 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4214 // When using a Non-OMP_PROC_BIND affinity method, 4215 // set all threads' place-partition-var to the entire place list 4216 th->th.th_first_place = 0; 4217 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4218 } 4219 4220 if (i == KMP_PLACE_ALL) { 4221 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4222 gtid)); 4223 } else { 4224 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4225 gtid, i)); 4226 } 4227 4228 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4229 4230 if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid) 4231 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4232 && (__kmp_affinity_type == affinity_none || 4233 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4234 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4235 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4236 th->th.th_affin_mask); 4237 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4238 __kmp_gettid(), gtid, buf); 4239 } 4240 4241 #if KMP_DEBUG 4242 // Hidden helper thread affinity only printed for debug builds 4243 if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) { 4244 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4245 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4246 th->th.th_affin_mask); 4247 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)", 4248 (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); 4249 } 4250 #endif 4251 4252 #if KMP_OS_WINDOWS 4253 // On Windows* OS, the process affinity mask might have changed. If the user 4254 // didn't request affinity and this call fails, just continue silently. 4255 // See CQ171393. 
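// Note: the second argument of __kmp_set_system_affinity() is the
// abort-on-error flag, so the affinity_none path below passes FALSE and
// tolerates a failed call, while every other affinity type passes TRUE so a
// failure is not silently ignored.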
4256 if (__kmp_affinity_type == affinity_none) { 4257 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4258 } else 4259 #endif 4260 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4261 } 4262 4263 void __kmp_affinity_set_place(int gtid) { 4264 if (!KMP_AFFINITY_CAPABLE()) { 4265 return; 4266 } 4267 4268 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4269 4270 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4271 "place = %d)\n", 4272 gtid, th->th.th_new_place, th->th.th_current_place)); 4273 4274 // Check that the new place is within this thread's partition. 4275 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4276 KMP_ASSERT(th->th.th_new_place >= 0); 4277 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4278 if (th->th.th_first_place <= th->th.th_last_place) { 4279 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 4280 (th->th.th_new_place <= th->th.th_last_place)); 4281 } else { 4282 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 4283 (th->th.th_new_place >= th->th.th_last_place)); 4284 } 4285 4286 // Copy the thread mask to the kmp_info_t structure, 4287 // and set this thread's affinity. 4288 kmp_affin_mask_t *mask = 4289 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); 4290 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4291 th->th.th_current_place = th->th.th_new_place; 4292 4293 if (__kmp_affinity_verbose) { 4294 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4295 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4296 th->th.th_affin_mask); 4297 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4298 __kmp_gettid(), gtid, buf); 4299 } 4300 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4301 } 4302 4303 int __kmp_aux_set_affinity(void **mask) { 4304 int gtid; 4305 kmp_info_t *th; 4306 int retval; 4307 4308 if (!KMP_AFFINITY_CAPABLE()) { 4309 return -1; 4310 } 4311 4312 gtid = __kmp_entry_gtid(); 4313 KA_TRACE( 4314 1000, (""); { 4315 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4316 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4317 (kmp_affin_mask_t *)(*mask)); 4318 __kmp_debug_printf( 4319 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4320 gtid, buf); 4321 }); 4322 4323 if (__kmp_env_consistency_check) { 4324 if ((mask == NULL) || (*mask == NULL)) { 4325 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4326 } else { 4327 unsigned proc; 4328 int num_procs = 0; 4329 4330 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 4331 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4332 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4333 } 4334 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4335 continue; 4336 } 4337 num_procs++; 4338 } 4339 if (num_procs == 0) { 4340 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4341 } 4342 4343 #if KMP_GROUP_AFFINITY 4344 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 4345 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4346 } 4347 #endif /* KMP_GROUP_AFFINITY */ 4348 } 4349 } 4350 4351 th = __kmp_threads[gtid]; 4352 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4353 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4354 if (retval == 0) { 4355 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 4356 } 4357 4358 th->th.th_current_place = KMP_PLACE_UNDEFINED; 4359 th->th.th_new_place = KMP_PLACE_UNDEFINED; 4360 th->th.th_first_place = 0; 4361 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4362 4363 // Turn off 4.0 
affinity for the current thread at this parallel level. 4364 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4365 4366 return retval; 4367 } 4368 4369 int __kmp_aux_get_affinity(void **mask) { 4370 int gtid; 4371 int retval; 4372 #if KMP_OS_WINDOWS || KMP_DEBUG 4373 kmp_info_t *th; 4374 #endif 4375 if (!KMP_AFFINITY_CAPABLE()) { 4376 return -1; 4377 } 4378 4379 gtid = __kmp_entry_gtid(); 4380 #if KMP_OS_WINDOWS || KMP_DEBUG 4381 th = __kmp_threads[gtid]; 4382 #else 4383 (void)gtid; // unused variable 4384 #endif 4385 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4386 4387 KA_TRACE( 4388 1000, (""); { 4389 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4390 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4391 th->th.th_affin_mask); 4392 __kmp_printf( 4393 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 4394 buf); 4395 }); 4396 4397 if (__kmp_env_consistency_check) { 4398 if ((mask == NULL) || (*mask == NULL)) { 4399 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4400 } 4401 } 4402 4403 #if !KMP_OS_WINDOWS 4404 4405 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4406 KA_TRACE( 4407 1000, (""); { 4408 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4409 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4410 (kmp_affin_mask_t *)(*mask)); 4411 __kmp_printf( 4412 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 4413 buf); 4414 }); 4415 return retval; 4416 4417 #else 4418 (void)retval; 4419 4420 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4421 return 0; 4422 4423 #endif /* KMP_OS_WINDOWS */ 4424 } 4425 4426 int __kmp_aux_get_affinity_max_proc() { 4427 if (!KMP_AFFINITY_CAPABLE()) { 4428 return 0; 4429 } 4430 #if KMP_GROUP_AFFINITY 4431 if (__kmp_num_proc_groups > 1) { 4432 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4433 } 4434 #endif 4435 return __kmp_xproc; 4436 } 4437 4438 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4439 if (!KMP_AFFINITY_CAPABLE()) { 4440 return -1; 4441 } 4442 4443 KA_TRACE( 4444 1000, (""); { 4445 int gtid = __kmp_entry_gtid(); 4446 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4447 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4448 (kmp_affin_mask_t *)(*mask)); 4449 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4450 "affinity mask for thread %d = %s\n", 4451 proc, gtid, buf); 4452 }); 4453 4454 if (__kmp_env_consistency_check) { 4455 if ((mask == NULL) || (*mask == NULL)) { 4456 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4457 } 4458 } 4459 4460 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4461 return -1; 4462 } 4463 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4464 return -2; 4465 } 4466 4467 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4468 return 0; 4469 } 4470 4471 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4472 if (!KMP_AFFINITY_CAPABLE()) { 4473 return -1; 4474 } 4475 4476 KA_TRACE( 4477 1000, (""); { 4478 int gtid = __kmp_entry_gtid(); 4479 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4480 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4481 (kmp_affin_mask_t *)(*mask)); 4482 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4483 "affinity mask for thread %d = %s\n", 4484 proc, gtid, buf); 4485 }); 4486 4487 if (__kmp_env_consistency_check) { 4488 if ((mask == NULL) || (*mask == NULL)) { 4489 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4490 } 4491 } 4492 4493 if ((proc < 0) || (proc >=
__kmp_aux_get_affinity_max_proc())) { 4494 return -1; 4495 } 4496 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4497 return -2; 4498 } 4499 4500 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4501 return 0; 4502 } 4503 4504 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4505 if (!KMP_AFFINITY_CAPABLE()) { 4506 return -1; 4507 } 4508 4509 KA_TRACE( 4510 1000, (""); { 4511 int gtid = __kmp_entry_gtid(); 4512 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4513 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4514 (kmp_affin_mask_t *)(*mask)); 4515 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4516 "affinity mask for thread %d = %s\n", 4517 proc, gtid, buf); 4518 }); 4519 4520 if (__kmp_env_consistency_check) { 4521 if ((mask == NULL) || (*mask == NULL)) { 4522 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4523 } 4524 } 4525 4526 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4527 return -1; 4528 } 4529 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4530 return 0; 4531 } 4532 4533 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4534 } 4535 4536 // Dynamic affinity settings - Affinity balanced 4537 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 4538 KMP_DEBUG_ASSERT(th); 4539 bool fine_gran = true; 4540 int tid = th->th.th_info.ds.ds_tid; 4541 4542 // Do not perform balanced affinity for the hidden helper threads 4543 if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) 4544 return; 4545 4546 switch (__kmp_affinity_gran) { 4547 case KMP_HW_THREAD: 4548 break; 4549 case KMP_HW_CORE: 4550 if (__kmp_nThreadsPerCore > 1) { 4551 fine_gran = false; 4552 } 4553 break; 4554 case KMP_HW_SOCKET: 4555 if (nCoresPerPkg > 1) { 4556 fine_gran = false; 4557 } 4558 break; 4559 default: 4560 fine_gran = false; 4561 } 4562 4563 if (__kmp_topology->is_uniform()) { 4564 int coreID; 4565 int threadID; 4566 // Number of hyper threads per core in HT machine 4567 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4568 // Number of cores 4569 int ncores = __kmp_ncores; 4570 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4571 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4572 ncores = nPackages; 4573 } 4574 // How many threads will be bound to each core 4575 int chunk = nthreads / ncores; 4576 // How many cores will have an additional thread bound to it - "big cores" 4577 int big_cores = nthreads % ncores; 4578 // Number of threads on the big cores 4579 int big_nth = (chunk + 1) * big_cores; 4580 if (tid < big_nth) { 4581 coreID = tid / (chunk + 1); 4582 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4583 } else { // tid >= big_nth 4584 coreID = (tid - big_cores) / chunk; 4585 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4586 } 4587 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4588 "Illegal set affinity operation when not capable"); 4589 4590 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4591 KMP_CPU_ZERO(mask); 4592 4593 if (fine_gran) { 4594 int osID = 4595 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; 4596 KMP_CPU_SET(osID, mask); 4597 } else { 4598 for (int i = 0; i < __kmp_nth_per_core; i++) { 4599 int osID; 4600 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; 4601 KMP_CPU_SET(osID, mask); 4602 } 4603 } 4604 if (__kmp_affinity_verbose) { 4605 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4606 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4607 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4608 __kmp_gettid(), 
tid, buf); 4609 } 4610 __kmp_set_system_affinity(mask, TRUE); 4611 } else { // Non-uniform topology 4612 4613 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4614 KMP_CPU_ZERO(mask); 4615 4616 int core_level = 4617 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); 4618 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, 4619 __kmp_aff_depth - 1, core_level); 4620 int nth_per_core = __kmp_affinity_max_proc_per_core( 4621 __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4622 4623 // For performance gain consider the special case nthreads == 4624 // __kmp_avail_proc 4625 if (nthreads == __kmp_avail_proc) { 4626 if (fine_gran) { 4627 int osID = __kmp_topology->at(tid).os_id; 4628 KMP_CPU_SET(osID, mask); 4629 } else { 4630 int core = 4631 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); 4632 for (int i = 0; i < __kmp_avail_proc; i++) { 4633 int osID = __kmp_topology->at(i).os_id; 4634 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == 4635 core) { 4636 KMP_CPU_SET(osID, mask); 4637 } 4638 } 4639 } 4640 } else if (nthreads <= ncores) { 4641 4642 int core = 0; 4643 for (int i = 0; i < ncores; i++) { 4644 // Check if this core from procarr[] is in the mask 4645 int in_mask = 0; 4646 for (int j = 0; j < nth_per_core; j++) { 4647 if (procarr[i * nth_per_core + j] != -1) { 4648 in_mask = 1; 4649 break; 4650 } 4651 } 4652 if (in_mask) { 4653 if (tid == core) { 4654 for (int j = 0; j < nth_per_core; j++) { 4655 int osID = procarr[i * nth_per_core + j]; 4656 if (osID != -1) { 4657 KMP_CPU_SET(osID, mask); 4658 // For fine granularity it is enough to set the first available 4659 // osID for this core 4660 if (fine_gran) { 4661 break; 4662 } 4663 } 4664 } 4665 break; 4666 } else { 4667 core++; 4668 } 4669 } 4670 } 4671 } else { // nthreads > ncores 4672 // Array to save the number of processors at each core 4673 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4674 // Array to save the number of cores with "x" available processors; 4675 int *ncores_with_x_procs = 4676 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4677 // Array to save the number of cores with # procs from x to nth_per_core 4678 int *ncores_with_x_to_max_procs = 4679 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4680 4681 for (int i = 0; i <= nth_per_core; i++) { 4682 ncores_with_x_procs[i] = 0; 4683 ncores_with_x_to_max_procs[i] = 0; 4684 } 4685 4686 for (int i = 0; i < ncores; i++) { 4687 int cnt = 0; 4688 for (int j = 0; j < nth_per_core; j++) { 4689 if (procarr[i * nth_per_core + j] != -1) { 4690 cnt++; 4691 } 4692 } 4693 nproc_at_core[i] = cnt; 4694 ncores_with_x_procs[cnt]++; 4695 } 4696 4697 for (int i = 0; i <= nth_per_core; i++) { 4698 for (int j = i; j <= nth_per_core; j++) { 4699 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4700 } 4701 } 4702 4703 // Max number of processors 4704 int nproc = nth_per_core * ncores; 4705 // An array to keep number of threads per each context 4706 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4707 for (int i = 0; i < nproc; i++) { 4708 newarr[i] = 0; 4709 } 4710 4711 int nth = nthreads; 4712 int flag = 0; 4713 while (nth > 0) { 4714 for (int j = 1; j <= nth_per_core; j++) { 4715 int cnt = ncores_with_x_to_max_procs[j]; 4716 for (int i = 0; i < ncores; i++) { 4717 // Skip the core with 0 processors 4718 if (nproc_at_core[i] == 0) { 4719 continue; 4720 } 4721 for (int k = 0; k < nth_per_core; k++) { 4722 if (procarr[i * nth_per_core + k] != -1) { 4723 if (newarr[i * nth_per_core + k] == 
0) { 4724 newarr[i * nth_per_core + k] = 1; 4725 cnt--; 4726 nth--; 4727 break; 4728 } else { 4729 if (flag != 0) { 4730 newarr[i * nth_per_core + k]++; 4731 cnt--; 4732 nth--; 4733 break; 4734 } 4735 } 4736 } 4737 } 4738 if (cnt == 0 || nth == 0) { 4739 break; 4740 } 4741 } 4742 if (nth == 0) { 4743 break; 4744 } 4745 } 4746 flag = 1; 4747 } 4748 int sum = 0; 4749 for (int i = 0; i < nproc; i++) { 4750 sum += newarr[i]; 4751 if (sum > tid) { 4752 if (fine_gran) { 4753 int osID = procarr[i]; 4754 KMP_CPU_SET(osID, mask); 4755 } else { 4756 int coreID = i / nth_per_core; 4757 for (int ii = 0; ii < nth_per_core; ii++) { 4758 int osID = procarr[coreID * nth_per_core + ii]; 4759 if (osID != -1) { 4760 KMP_CPU_SET(osID, mask); 4761 } 4762 } 4763 } 4764 break; 4765 } 4766 } 4767 __kmp_free(newarr); 4768 } 4769 4770 if (__kmp_affinity_verbose) { 4771 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4772 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4773 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4774 __kmp_gettid(), tid, buf); 4775 } 4776 __kmp_set_system_affinity(mask, TRUE); 4777 } 4778 } 4779 4780 #if KMP_OS_LINUX || KMP_OS_FREEBSD 4781 // We don't need this entry for Windows because 4782 // there is GetProcessAffinityMask() api 4783 // 4784 // The intended usage is indicated by these steps: 4785 // 1) The user gets the current affinity mask 4786 // 2) Then sets the affinity by calling this function 4787 // 3) Error check the return value 4788 // 4) Use non-OpenMP parallelization 4789 // 5) Reset the affinity to what was stored in step 1) 4790 #ifdef __cplusplus 4791 extern "C" 4792 #endif 4793 int 4794 kmp_set_thread_affinity_mask_initial() 4795 // the function returns 0 on success, 4796 // -1 if we cannot bind thread 4797 // >0 (errno) if an error happened during binding 4798 { 4799 int gtid = __kmp_get_gtid(); 4800 if (gtid < 0) { 4801 // Do not touch non-omp threads 4802 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4803 "non-omp thread, returning\n")); 4804 return -1; 4805 } 4806 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 4807 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4808 "affinity not initialized, returning\n")); 4809 return -1; 4810 } 4811 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4812 "set full mask for thread %d\n", 4813 gtid)); 4814 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 4815 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 4816 } 4817 #endif 4818 4819 #endif // KMP_AFFINITY_SUPPORTED 4820
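// Usage sketch for kmp_set_thread_affinity_mask_initial() (illustrative only;
// run_non_openmp_work() is a hypothetical placeholder, and the pthread_*_np
// calls are the GNU/Linux thread-affinity API):
//
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // 1) save current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0) {             // 2)+3) widen mask, check result
//     run_non_openmp_work();                                       // 4) non-OpenMP parallelism
//   }
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // 5) restore saved mask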