/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? "sockets" : "socket");
  case KMP_HW_DIE:
    return ((plural) ? "dice" : "die");
  case KMP_HW_MODULE:
    return ((plural) ? "modules" : "module");
  case KMP_HW_TILE:
    return ((plural) ? "tiles" : "tile");
  case KMP_HW_NUMA:
    return ((plural) ? "numa_domains" : "numa_domain");
  case KMP_HW_L3:
    return ((plural) ? "l3_caches" : "l3_cache");
  case KMP_HW_L2:
    return ((plural) ? "l2_caches" : "l2_cache");
  case KMP_HW_L1:
    return ((plural) ? "l1_caches" : "l1_cache");
  case KMP_HW_LLC:
    return ((plural) ? "ll_caches" : "ll_cache");
  case KMP_HW_CORE:
    return ((plural) ? "cores" : "core");
  case KMP_HW_THREAD:
    return ((plural) ? "threads" : "thread");
  case KMP_HW_PROC_GROUP:
    return ((plural) ? "proc_groups" : "proc_group");
  }
  return ((plural) ? "unknowns" : "unknown");
}

const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) {
  switch (type) {
  case KMP_HW_CORE_TYPE_UNKNOWN:
    return "unknown";
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  case KMP_HW_CORE_TYPE_ATOM:
    return "Intel Atom(R) processor";
  case KMP_HW_CORE_TYPE_CORE:
    return "Intel(R) Core(TM) processor";
#endif
  }
  return "unknown";
}

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  for (int level = 0; level < depth; ++level) {
    if (ahwthread->ids[level] < bhwthread->ids[level])
      return -1;
    else if (ahwthread->ids[level] > bhwthread->ids[level])
      return 1;
  }
  if (ahwthread->os_id < bhwthread->os_id)
    return -1;
  else if (ahwthread->os_id > bhwthread->os_id)
    return 1;
  return 0;
}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
#endif

void kmp_hw_thread_t::print() const {
  int depth = __kmp_topology->get_depth();
  printf("%4d ", os_id);
  for (int i = 0; i < depth; ++i) {
    printf("%4d ", ids[i]);
  }
  if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) {
    printf(" (%s)", __kmp_hw_get_core_type_string(core_type));
  }
  printf("\n");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}

void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}

// Gather the count of each topology layer and the ratio
void kmp_topology_t::_gather_enumeration_information() {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];
  int previous_core_id = kmp_hw_thread_t::UNKNOWN_ID;

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      core_types_count[i] = 0;
      core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN;
    }
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.ids[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.ids[layer];
    }
    // Figure out the number of each core type for hybrid CPUs
    if (__kmp_is_hybrid_cpu()) {
      int core_level = get_level(KMP_HW_CORE);
      if (core_level != -1) {
        if (hw_thread.ids[core_level] != previous_core_id)
          _increment_core_type(hw_thread.core_type);
        previous_core_id = hw_thread.ids[core_level];
      }
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  flags.uniform = (num == count[depth - 1]);
}
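// Illustrative note (not in the original source): on an assumed machine with
// 2 sockets x 4 cores/socket x 2 threads/core, ratio[] is {2, 4, 2} and
// count[] is {2, 8, 16}, so 2 * 4 * 2 == count[depth - 1] == 16 and the
// topology is considered uniform. If one socket had a core disabled, the
// product of the ratios would exceed count[depth - 1] and flags.uniform
// would be false.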
// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {
  int previous_id[KMP_HW_LAST];
  int sub_id[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    sub_id[i] = -1;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Setup the sub_id
    for (int j = 0; j < depth; ++j) {
      if (hw_thread.ids[j] != previous_id[j]) {
        sub_id[j]++;
        for (int k = j + 1; k < depth; ++k) {
          sub_id[k] = 0;
        }
        break;
      }
    }
    // Set previous_id
    for (int j = 0; j < depth; ++j) {
      previous_id[j] = hw_thread.ids[j];
    }
    // Set the sub_ids field
    for (int j = 0; j < depth; ++j) {
      hw_thread.sub_ids[j] = sub_id[j];
    }
  }
}

void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * ndepth * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + ndepth;
  retval->count = arr + 2 * ndepth;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}

bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* core_types:\n");
  for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
    if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) {
      printf(" %d %s core%c\n", core_types_count[i],
             __kmp_hw_get_core_type_string(core_types[i]),
             ((core_types_count[i] > 1) ? 's' : ' '));
    } else {
      if (i == 0)
        printf("No hybrid information available\n");
      break;
    }
  }

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}

void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  if (__kmp_is_hybrid_cpu()) {
    for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) {
      if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN)
        break;
      KMP_INFORM(TopologyHybrid, env_var, core_types_count[i],
                 __kmp_hw_get_core_type_string(core_types[i]));
    }
  }

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    if (__kmp_is_hybrid_cpu())
      __kmp_str_buf_print(
          &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type));
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}

void kmp_topology_t::canonicalize() {
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

#if KMP_MIC_SUPPORTED
  // Manually Add L2 = Tile equivalence
  if (__kmp_mic_type == mic3) {
    if (get_level(KMP_HW_L2) != -1)
      set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
    else if (get_level(KMP_HW_TILE) != -1)
      set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
  }
#endif

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}
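// Illustrative example of the granularity computation above (assumed
// settings, not from the original source): with detected types
// {socket, core, thread} and KMP_AFFINITY granularity=core, the loop counts
// one level (thread) below the granularity level, so
// __kmp_affinity_gran_levels becomes 1; with granularity=socket it would be
// 2, and with granularity=thread it stays 0.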
// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if layers are in order
    if (i + 1 < hw_subset_depth) {
      kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
      if (next_type == KMP_HW_UNKNOWN) {
        KMP_WARNING(
            AffHWSubsetNotExistGeneric,
            __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
        return false;
      }
      int next_topology_level = get_level(next_type);
      if (level > next_topology_level) {
        KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
                    __kmp_hw_get_catalog_string(next_type));
        return false;
      }
    }

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}
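// Illustrative example of the filtering above (assumed machine, not from the
// original source): on a 2 socket x 8 core x 2 thread system with
// KMP_HW_SUBSET=1s,4c,2t (offsets default to 0), only hw threads whose
// socket sub_id is 0 and whose core sub_id is in [0, 4) are kept, i.e. 8 of
// the 32 hardware threads; the rest are cleared from __kmp_affin_fullMask
// and __kmp_avail_proc drops accordingly.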
bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
  if (hw_level >= depth)
    return true;
  bool retval = true;
  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
  for (int i = 0; i < (depth - hw_level); ++i) {
    if (t1.ids[i] != t2.ids[i])
      return false;
  }
  return retval;
}

////////////////////////////////////////////////////////////////////////////////

#if KMP_AFFINITY_SUPPORTED
class kmp_affinity_raii_t {
  kmp_affin_mask_t *mask;
  bool restored;

public:
  kmp_affinity_raii_t() : restored(false) {
    KMP_CPU_ALLOC(mask);
    KMP_ASSERT(mask != NULL);
    __kmp_get_system_affinity(mask, TRUE);
  }
  void restore() {
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE(mask);
    restored = true;
  }
  ~kmp_affinity_raii_t() {
    if (!restored) {
      __kmp_set_system_affinity(mask, TRUE);
      KMP_CPU_FREE(mask);
    }
  }
};

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it. Each routine
// returns a boolean on whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // Store sub_id + 1 so that 0 can be distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity_verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
  }

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    return true;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
    avail_ct++;
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }
  return true;
}
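// Illustrative result of the flat map above (not from the original source):
// with 8 available OS procs, hw thread i gets ids {i, 0, 0}, i.e. the machine
// is modeled as 8 sockets x 1 core/socket x 1 thread/core.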
#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
  }
  return true;
}
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}
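// Worked examples for the two helpers above (illustrative, not from the
// original source):
//   __kmp_extract_bits<24, 31>(0xAB123456) == 0xAB  (keeps bits 31:24)
//   __kmp_extract_bits<0, 4>(0x3F) == 0x1F          (keeps bits 4:0)
//   __kmp_cpuid_mask_width(6) == 3                  (2^3 is the smallest
//                                                    power of two >= 6)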
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

class kmp_cache_info_t {
public:
  struct info_t {
    unsigned level, mask;
  };
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth; }
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    switch (level) {
    case 1:
      return KMP_HW_L1;
    case 2:
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

private:
  static const int MAX_CACHE_LEVEL = 3;

  size_t depth;
  info_t table[MAX_CACHE_LEVEL];

  void get_leaf4_levels() {
    unsigned level = 0;
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      kmp_cpuid buf2;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      if (!cache_type)
        break;
      // Skip instruction caches
      if (cache_type == 2) {
        level++;
        continue;
      }
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      table[depth].mask = ((-1) << cache_mask_width);
      depth++;
      level++;
    }
  }
};
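// Illustrative reading of the table built above (assumed values, not from
// the original source): if CPUID leaf 4 reports max_threads_sharing == 2 for
// the L1 cache, cache_mask_width is 1 and the stored mask is ~1u, so two
// logical processors whose APIC ids differ only in bit 0 are treated as
// sharing that L1 cache.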
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
  }

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return false;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    return true;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return false;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return false;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }
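  // Worked example of the decode above (illustrative, not from the original
  // source): with maxThreadsPerPkg == 8 and maxCoresPerPkg == 4, widthCT == 3,
  // widthC == 2 and widthT == 1; an apicId of 13 (0b01101) then yields
  // pkgId == 1, coreId == 2 and threadId == 1.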
1720 nPackages = 1; 1721 nCoresPerPkg = 1; 1722 __kmp_nThreadsPerCore = 1; 1723 unsigned nCores = 1; 1724 1725 unsigned pkgCt = 1; // to determine radii 1726 unsigned lastPkgId = threadInfo[0].pkgId; 1727 unsigned coreCt = 1; 1728 unsigned lastCoreId = threadInfo[0].coreId; 1729 unsigned threadCt = 1; 1730 unsigned lastThreadId = threadInfo[0].threadId; 1731 1732 // intra-pkg consist checks 1733 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1734 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1735 1736 for (i = 1; i < nApics; i++) { 1737 if (threadInfo[i].pkgId != lastPkgId) { 1738 nCores++; 1739 pkgCt++; 1740 lastPkgId = threadInfo[i].pkgId; 1741 if ((int)coreCt > nCoresPerPkg) 1742 nCoresPerPkg = coreCt; 1743 coreCt = 1; 1744 lastCoreId = threadInfo[i].coreId; 1745 if ((int)threadCt > __kmp_nThreadsPerCore) 1746 __kmp_nThreadsPerCore = threadCt; 1747 threadCt = 1; 1748 lastThreadId = threadInfo[i].threadId; 1749 1750 // This is a different package, so go on to the next iteration without 1751 // doing any consistency checks. Reset the consistency check vars, though. 1752 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1753 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1754 continue; 1755 } 1756 1757 if (threadInfo[i].coreId != lastCoreId) { 1758 nCores++; 1759 coreCt++; 1760 lastCoreId = threadInfo[i].coreId; 1761 if ((int)threadCt > __kmp_nThreadsPerCore) 1762 __kmp_nThreadsPerCore = threadCt; 1763 threadCt = 1; 1764 lastThreadId = threadInfo[i].threadId; 1765 } else if (threadInfo[i].threadId != lastThreadId) { 1766 threadCt++; 1767 lastThreadId = threadInfo[i].threadId; 1768 } else { 1769 __kmp_free(threadInfo); 1770 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1771 return false; 1772 } 1773 1774 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1775 // fields agree between all the threads bounds to a given package. 1776 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) || 1777 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1778 __kmp_free(threadInfo); 1779 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1780 return false; 1781 } 1782 } 1783 // When affinity is off, this routine will still be called to set 1784 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1785 // Make sure all these vars are set correctly 1786 nPackages = pkgCt; 1787 if ((int)coreCt > nCoresPerPkg) 1788 nCoresPerPkg = coreCt; 1789 if ((int)threadCt > __kmp_nThreadsPerCore) 1790 __kmp_nThreadsPerCore = threadCt; 1791 __kmp_ncores = nCores; 1792 KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc); 1793 1794 // Now that we've determined the number of packages, the number of cores per 1795 // package, and the number of threads per core, we can construct the data 1796 // structure that is to be returned. 1797 int idx = 0; 1798 int pkgLevel = 0; 1799 int coreLevel = 1; 1800 int threadLevel = 2; 1801 //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1802 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1803 kmp_hw_t types[3]; 1804 if (pkgLevel >= 0) 1805 types[idx++] = KMP_HW_SOCKET; 1806 if (coreLevel >= 0) 1807 types[idx++] = KMP_HW_CORE; 1808 if (threadLevel >= 0) 1809 types[idx++] = KMP_HW_THREAD; 1810 1811 KMP_ASSERT(depth > 0); 1812 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 1813 1814 for (i = 0; i < nApics; ++i) { 1815 idx = 0; 1816 unsigned os = threadInfo[i].osId; 1817 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 1818 hw_thread.clear(); 1819 1820 if (pkgLevel >= 0) { 1821 hw_thread.ids[idx++] = threadInfo[i].pkgId; 1822 } 1823 if (coreLevel >= 0) { 1824 hw_thread.ids[idx++] = threadInfo[i].coreId; 1825 } 1826 if (threadLevel >= 0) { 1827 hw_thread.ids[idx++] = threadInfo[i].threadId; 1828 } 1829 hw_thread.os_id = os; 1830 } 1831 1832 __kmp_free(threadInfo); 1833 __kmp_topology->sort_ids(); 1834 if (!__kmp_topology->check_ids()) { 1835 kmp_topology_t::deallocate(__kmp_topology); 1836 __kmp_topology = nullptr; 1837 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1838 return false; 1839 } 1840 return true; 1841 } 1842 1843 // Hybrid cpu detection using CPUID.1A 1844 // Thread should be pinned to processor already 1845 static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type, 1846 unsigned *native_model_id) { 1847 kmp_cpuid buf; 1848 __kmp_x86_cpuid(0x1a, 0, &buf); 1849 *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax); 1850 *native_model_id = __kmp_extract_bits<0, 23>(buf.eax); 1851 } 1852 1853 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1854 // architectures support a newer interface for specifying the x2APIC Ids, 1855 // based on CPUID.B or CPUID.1F 1856 /* 1857 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 1858 Bits Bits Bits Bits 1859 31-16 15-8 7-4 4-0 1860 ---+-----------+--------------+-------------+-----------------+ 1861 EAX| reserved | reserved | reserved | Bits to Shift | 1862 ---+-----------|--------------+-------------+-----------------| 1863 EBX| reserved | Num logical processors at level (16 bits) | 1864 ---+-----------|--------------+-------------------------------| 1865 ECX| reserved | Level Type | Level Number (8 bits) | 1866 ---+-----------+--------------+-------------------------------| 1867 EDX| X2APIC ID (32 bits) | 1868 ---+----------------------------------------------------------+ 1869 */ 1870 1871 enum { 1872 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 1873 INTEL_LEVEL_TYPE_SMT = 1, 1874 INTEL_LEVEL_TYPE_CORE = 2, 1875 INTEL_LEVEL_TYPE_TILE = 3, 1876 INTEL_LEVEL_TYPE_MODULE = 4, 1877 INTEL_LEVEL_TYPE_DIE = 5, 1878 INTEL_LEVEL_TYPE_LAST = 6, 1879 }; 1880 1881 struct cpuid_level_info_t { 1882 unsigned level_type, mask, mask_width, nitems, cache_mask; 1883 }; 1884 1885 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 1886 switch (intel_type) { 1887 case INTEL_LEVEL_TYPE_INVALID: 1888 return KMP_HW_SOCKET; 1889 case INTEL_LEVEL_TYPE_SMT: 1890 return KMP_HW_THREAD; 1891 case INTEL_LEVEL_TYPE_CORE: 1892 return KMP_HW_CORE; 1893 case INTEL_LEVEL_TYPE_TILE: 1894 return KMP_HW_TILE; 1895 case INTEL_LEVEL_TYPE_MODULE: 1896 return KMP_HW_MODULE; 1897 case INTEL_LEVEL_TYPE_DIE: 1898 return KMP_HW_DIE; 1899 } 1900 return KMP_HW_UNKNOWN; 1901 } 1902 1903 // This function takes the topology leaf, a levels array to store the levels 1904 // detected and a bitmap of the known levels. 
1905 // Returns the number of levels in the topology 1906 static unsigned 1907 __kmp_x2apicid_get_levels(int leaf, 1908 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 1909 kmp_uint64 known_levels) { 1910 unsigned level, levels_index; 1911 unsigned level_type, mask_width, nitems; 1912 kmp_cpuid buf; 1913 1914 // New algorithm has known topology layers act as highest unknown topology 1915 // layers when unknown topology layers exist. 1916 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 1917 // are unknown topology layers, Then SMT will take the characteristics of 1918 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 1919 // This eliminates unknown portions of the topology while still keeping the 1920 // correct structure. 1921 level = levels_index = 0; 1922 do { 1923 __kmp_x86_cpuid(leaf, level, &buf); 1924 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 1925 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 1926 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 1927 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 1928 return 0; 1929 1930 if (known_levels & (1ull << level_type)) { 1931 // Add a new level to the topology 1932 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 1933 levels[levels_index].level_type = level_type; 1934 levels[levels_index].mask_width = mask_width; 1935 levels[levels_index].nitems = nitems; 1936 levels_index++; 1937 } else { 1938 // If it is an unknown level, then logically move the previous layer up 1939 if (levels_index > 0) { 1940 levels[levels_index - 1].mask_width = mask_width; 1941 levels[levels_index - 1].nitems = nitems; 1942 } 1943 } 1944 level++; 1945 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 1946 1947 // Set the masks to & with apicid 1948 for (unsigned i = 0; i < levels_index; ++i) { 1949 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 1950 levels[i].mask = ~((-1) << levels[i].mask_width); 1951 levels[i].cache_mask = (-1) << levels[i].mask_width; 1952 for (unsigned j = 0; j < i; ++j) 1953 levels[i].mask ^= levels[j].mask; 1954 } else { 1955 KMP_DEBUG_ASSERT(levels_index > 0); 1956 levels[i].mask = (-1) << levels[i - 1].mask_width; 1957 levels[i].cache_mask = 0; 1958 } 1959 } 1960 return levels_index; 1961 } 1962 1963 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 1964 1965 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 1966 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 1967 unsigned levels_index; 1968 kmp_cpuid buf; 1969 kmp_uint64 known_levels; 1970 int topology_leaf, highest_leaf, apic_id; 1971 int num_leaves; 1972 static int leaves[] = {0, 0}; 1973 1974 kmp_i18n_id_t leaf_message_id; 1975 1976 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 1977 1978 *msg_id = kmp_i18n_null; 1979 if (__kmp_affinity_verbose) { 1980 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 1981 } 1982 1983 // Figure out the known topology levels 1984 known_levels = 0ull; 1985 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 1986 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 1987 known_levels |= (1ull << i); 1988 } 1989 } 1990 1991 // Get the highest cpuid leaf supported 1992 __kmp_x86_cpuid(0, 0, &buf); 1993 highest_leaf = buf.eax; 1994 1995 // If a specific topology method was requested, only allow that specific leaf 1996 // otherwise, try both leaves 31 and 11 in that order 1997 num_leaves = 0; 1998 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 1999 num_leaves = 1; 2000 leaves[0] = 11; 2001 
leaf_message_id = kmp_i18n_str_NoLeaf11Support; 2002 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 2003 num_leaves = 1; 2004 leaves[0] = 31; 2005 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 2006 } else { 2007 num_leaves = 2; 2008 leaves[0] = 31; 2009 leaves[1] = 11; 2010 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 2011 } 2012 2013 // Check to see if cpuid leaf 31 or 11 is supported. 2014 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2015 topology_leaf = -1; 2016 for (int i = 0; i < num_leaves; ++i) { 2017 int leaf = leaves[i]; 2018 if (highest_leaf < leaf) 2019 continue; 2020 __kmp_x86_cpuid(leaf, 0, &buf); 2021 if (buf.ebx == 0) 2022 continue; 2023 topology_leaf = leaf; 2024 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 2025 if (levels_index == 0) 2026 continue; 2027 break; 2028 } 2029 if (topology_leaf == -1 || levels_index == 0) { 2030 *msg_id = leaf_message_id; 2031 return false; 2032 } 2033 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 2034 2035 // The algorithm used starts by setting the affinity to each available thread 2036 // and retrieving info from the cpuid instruction, so if we are not capable of 2037 // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then 2038 // we need to do something else - use the defaults that we calculated from 2039 // issuing cpuid without binding to each proc. 2040 if (!KMP_AFFINITY_CAPABLE()) { 2041 // Hack to try and infer the machine topology using only the data 2042 // available from cpuid on the current thread, and __kmp_xproc. 2043 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2044 for (unsigned i = 0; i < levels_index; ++i) { 2045 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 2046 __kmp_nThreadsPerCore = levels[i].nitems; 2047 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 2048 nCoresPerPkg = levels[i].nitems; 2049 } 2050 } 2051 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 2052 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 2053 return true; 2054 } 2055 2056 // Allocate the data structure to be returned. 2057 int depth = levels_index; 2058 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 2059 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 2060 __kmp_topology = 2061 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 2062 2063 // Insert equivalent cache types if they exist 2064 kmp_cache_info_t cache_info; 2065 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 2066 const kmp_cache_info_t::info_t &info = cache_info[i]; 2067 unsigned cache_mask = info.mask; 2068 unsigned cache_level = info.level; 2069 for (unsigned j = 0; j < levels_index; ++j) { 2070 unsigned hw_cache_mask = levels[j].cache_mask; 2071 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 2072 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 2073 kmp_hw_t type = 2074 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 2075 __kmp_topology->set_equivalent_type(cache_type, type); 2076 } 2077 } 2078 } 2079 2080 // From here on, we can assume that it is safe to call 2081 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2082 // __kmp_affinity_type = affinity_none. 2083 2084 // Save the affinity mask for the current thread. 2085 kmp_affinity_raii_t previous_affinity; 2086 2087 // Run through each of the available contexts, binding the current thread 2088 // to it, and obtaining the pertinent information using the cpuid instr. 
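// Illustrative example of how the masks computed by
// __kmp_x2apicid_get_levels() decompose an x2APIC id (hypothetical shift
// widths, SMT shift = 1 and CORE shift = 5, no module/tile/die levels):
//   levels[0] SMT : mask = 0x1
//   levels[1] CORE: mask = 0x1E  (low 5 bits minus the SMT bit)
//   levels[2] PKG : mask = ~0x1F (everything above bit 4)
// For apic_id = 0x53 (binary 1010011) the loop below produces
//   thread id  = 0x53 & 0x1           = 1
//   core id    = (0x53 & 0x1E) >> 1   = 9
//   package id = (0x53 & ~0x1F) >> 5  = 2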
2089 unsigned int proc; 2090 int hw_thread_index = 0; 2091 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2092 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2093 unsigned my_levels_index; 2094 2095 // Skip this proc if it is not included in the machine model. 2096 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2097 continue; 2098 } 2099 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2100 2101 __kmp_affinity_dispatch->bind_thread(proc); 2102 2103 // New algorithm 2104 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2105 apic_id = buf.edx; 2106 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2107 my_levels_index = 2108 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2109 if (my_levels_index == 0 || my_levels_index != levels_index) { 2110 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2111 return false; 2112 } 2113 hw_thread.clear(); 2114 hw_thread.os_id = proc; 2115 // Put in topology information 2116 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2117 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2118 if (j > 0) { 2119 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2120 } 2121 } 2122 // Hybrid information 2123 if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { 2124 kmp_hw_core_type_t type; 2125 unsigned native_model_id; 2126 __kmp_get_hybrid_info(&type, &native_model_id); 2127 hw_thread.core_type = type; 2128 } 2129 hw_thread_index++; 2130 } 2131 KMP_ASSERT(hw_thread_index > 0); 2132 __kmp_topology->sort_ids(); 2133 if (!__kmp_topology->check_ids()) { 2134 kmp_topology_t::deallocate(__kmp_topology); 2135 __kmp_topology = nullptr; 2136 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2137 return false; 2138 } 2139 return true; 2140 } 2141 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2142 2143 #define osIdIndex 0 2144 #define threadIdIndex 1 2145 #define coreIdIndex 2 2146 #define pkgIdIndex 3 2147 #define nodeIdIndex 4 2148 2149 typedef unsigned *ProcCpuInfo; 2150 static unsigned maxIndex = pkgIdIndex; 2151 2152 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2153 const void *b) { 2154 unsigned i; 2155 const unsigned *aa = *(unsigned *const *)a; 2156 const unsigned *bb = *(unsigned *const *)b; 2157 for (i = maxIndex;; i--) { 2158 if (aa[i] < bb[i]) 2159 return -1; 2160 if (aa[i] > bb[i]) 2161 return 1; 2162 if (i == osIdIndex) 2163 break; 2164 } 2165 return 0; 2166 } 2167 2168 #if KMP_USE_HIER_SCHED 2169 // Set the array sizes for the hierarchy layers 2170 static void __kmp_dispatch_set_hierarchy_values() { 2171 // Set the maximum number of L1's to number of cores 2172 // Set the maximum number of L2's to to either number of cores / 2 for 2173 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2174 // Or the number of cores for Intel(R) Xeon(R) processors 2175 // Set the maximum number of NUMA nodes and L3's to number of packages 2176 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2177 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2178 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2179 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2180 KMP_MIC_SUPPORTED 2181 if (__kmp_mic_type >= mic3) 2182 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2183 else 2184 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2185 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2186 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2187 
__kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2188 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2189 // Set the number of threads per unit 2190 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2191 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2192 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2193 __kmp_nThreadsPerCore; 2194 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2195 KMP_MIC_SUPPORTED 2196 if (__kmp_mic_type >= mic3) 2197 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2198 2 * __kmp_nThreadsPerCore; 2199 else 2200 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2201 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2202 __kmp_nThreadsPerCore; 2203 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2204 nCoresPerPkg * __kmp_nThreadsPerCore; 2205 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2206 nCoresPerPkg * __kmp_nThreadsPerCore; 2207 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2208 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2209 } 2210 2211 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2212 // i.e., this thread's L1 or this thread's L2, etc. 2213 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2214 int index = type + 1; 2215 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2216 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2217 if (type == kmp_hier_layer_e::LAYER_THREAD) 2218 return tid; 2219 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2220 return 0; 2221 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2222 if (tid >= num_hw_threads) 2223 tid = tid % num_hw_threads; 2224 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2225 } 2226 2227 // Return the number of t1's per t2 2228 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2229 int i1 = t1 + 1; 2230 int i2 = t2 + 1; 2231 KMP_DEBUG_ASSERT(i1 <= i2); 2232 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2233 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2234 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2235 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2236 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2237 } 2238 #endif // KMP_USE_HIER_SCHED 2239 2240 static inline const char *__kmp_cpuinfo_get_filename() { 2241 const char *filename; 2242 if (__kmp_cpuinfo_file != nullptr) 2243 filename = __kmp_cpuinfo_file; 2244 else 2245 filename = "/proc/cpuinfo"; 2246 return filename; 2247 } 2248 2249 static inline const char *__kmp_cpuinfo_get_envvar() { 2250 const char *envvar = nullptr; 2251 if (__kmp_cpuinfo_file != nullptr) 2252 envvar = "KMP_CPUINFO_FILE"; 2253 return envvar; 2254 } 2255 2256 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2257 // affinity map. 2258 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2259 kmp_i18n_id_t *const msg_id) { 2260 const char *filename = __kmp_cpuinfo_get_filename(); 2261 const char *envvar = __kmp_cpuinfo_get_envvar(); 2262 *msg_id = kmp_i18n_null; 2263 2264 if (__kmp_affinity_verbose) { 2265 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2266 } 2267 2268 kmp_safe_raii_file_t f(filename, "r", envvar); 2269 2270 // Scan of the file, and count the number of "processor" (osId) fields, 2271 // and find the highest value of <n> for a node_<n> field. 
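// For reference, a record in the format the parser below expects looks like
// this (hypothetical values; real files contain many additional fields, and
// an optional "thread id" field is also recognized):
//   processor   : 12
//   physical id : 1
//   core id     : 2
//   node_0 id   : 0
// Only the leading keyword of each line and the number following the ':' are
// significant, and a blank line terminates a processor record.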
2272 char buf[256]; 2273 unsigned num_records = 0; 2274 while (!feof(f)) { 2275 buf[sizeof(buf) - 1] = 1; 2276 if (!fgets(buf, sizeof(buf), f)) { 2277 // Read errors presumably because of EOF 2278 break; 2279 } 2280 2281 char s1[] = "processor"; 2282 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2283 num_records++; 2284 continue; 2285 } 2286 2287 // FIXME - this will match "node_<n> <garbage>" 2288 unsigned level; 2289 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2290 // validate the input fisrt: 2291 if (level > (unsigned)__kmp_xproc) { // level is too big 2292 level = __kmp_xproc; 2293 } 2294 if (nodeIdIndex + level >= maxIndex) { 2295 maxIndex = nodeIdIndex + level; 2296 } 2297 continue; 2298 } 2299 } 2300 2301 // Check for empty file / no valid processor records, or too many. The number 2302 // of records can't exceed the number of valid bits in the affinity mask. 2303 if (num_records == 0) { 2304 *msg_id = kmp_i18n_str_NoProcRecords; 2305 return false; 2306 } 2307 if (num_records > (unsigned)__kmp_xproc) { 2308 *msg_id = kmp_i18n_str_TooManyProcRecords; 2309 return false; 2310 } 2311 2312 // Set the file pointer back to the beginning, so that we can scan the file 2313 // again, this time performing a full parse of the data. Allocate a vector of 2314 // ProcCpuInfo object, where we will place the data. Adding an extra element 2315 // at the end allows us to remove a lot of extra checks for termination 2316 // conditions. 2317 if (fseek(f, 0, SEEK_SET) != 0) { 2318 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2319 return false; 2320 } 2321 2322 // Allocate the array of records to store the proc info in. The dummy 2323 // element at the end makes the logic in filling them out easier to code. 2324 unsigned **threadInfo = 2325 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2326 unsigned i; 2327 for (i = 0; i <= num_records; i++) { 2328 threadInfo[i] = 2329 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2330 } 2331 2332 #define CLEANUP_THREAD_INFO \ 2333 for (i = 0; i <= num_records; i++) { \ 2334 __kmp_free(threadInfo[i]); \ 2335 } \ 2336 __kmp_free(threadInfo); 2337 2338 // A value of UINT_MAX means that we didn't find the field 2339 unsigned __index; 2340 2341 #define INIT_PROC_INFO(p) \ 2342 for (__index = 0; __index <= maxIndex; __index++) { \ 2343 (p)[__index] = UINT_MAX; \ 2344 } 2345 2346 for (i = 0; i <= num_records; i++) { 2347 INIT_PROC_INFO(threadInfo[i]); 2348 } 2349 2350 unsigned num_avail = 0; 2351 *line = 0; 2352 while (!feof(f)) { 2353 // Create an inner scoping level, so that all the goto targets at the end of 2354 // the loop appear in an outer scoping level. This avoids warnings about 2355 // jumping past an initialization to a target in the same block. 2356 { 2357 buf[sizeof(buf) - 1] = 1; 2358 bool long_line = false; 2359 if (!fgets(buf, sizeof(buf), f)) { 2360 // Read errors presumably because of EOF 2361 // If there is valid data in threadInfo[num_avail], then fake 2362 // a blank line in ensure that the last address gets parsed. 2363 bool valid = false; 2364 for (i = 0; i <= maxIndex; i++) { 2365 if (threadInfo[num_avail][i] != UINT_MAX) { 2366 valid = true; 2367 } 2368 } 2369 if (!valid) { 2370 break; 2371 } 2372 buf[0] = 0; 2373 } else if (!buf[sizeof(buf) - 1]) { 2374 // The line is longer than the buffer. Set a flag and don't 2375 // emit an error if we were going to ignore the line, anyway. 
2376 long_line = true; 2377 2378 #define CHECK_LINE \ 2379 if (long_line) { \ 2380 CLEANUP_THREAD_INFO; \ 2381 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2382 return false; \ 2383 } 2384 } 2385 (*line)++; 2386 2387 char s1[] = "processor"; 2388 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2389 CHECK_LINE; 2390 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2391 unsigned val; 2392 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2393 goto no_val; 2394 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2395 #if KMP_ARCH_AARCH64 2396 // Handle the old AArch64 /proc/cpuinfo layout differently, 2397 // it contains all of the 'processor' entries listed in a 2398 // single 'Processor' section, therefore the normal looking 2399 // for duplicates in that section will always fail. 2400 num_avail++; 2401 #else 2402 goto dup_field; 2403 #endif 2404 threadInfo[num_avail][osIdIndex] = val; 2405 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2406 char path[256]; 2407 KMP_SNPRINTF( 2408 path, sizeof(path), 2409 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2410 threadInfo[num_avail][osIdIndex]); 2411 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2412 2413 KMP_SNPRINTF(path, sizeof(path), 2414 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2415 threadInfo[num_avail][osIdIndex]); 2416 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2417 continue; 2418 #else 2419 } 2420 char s2[] = "physical id"; 2421 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2422 CHECK_LINE; 2423 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2424 unsigned val; 2425 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2426 goto no_val; 2427 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2428 goto dup_field; 2429 threadInfo[num_avail][pkgIdIndex] = val; 2430 continue; 2431 } 2432 char s3[] = "core id"; 2433 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2434 CHECK_LINE; 2435 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2436 unsigned val; 2437 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2438 goto no_val; 2439 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2440 goto dup_field; 2441 threadInfo[num_avail][coreIdIndex] = val; 2442 continue; 2443 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2444 } 2445 char s4[] = "thread id"; 2446 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2447 CHECK_LINE; 2448 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2449 unsigned val; 2450 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2451 goto no_val; 2452 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2453 goto dup_field; 2454 threadInfo[num_avail][threadIdIndex] = val; 2455 continue; 2456 } 2457 unsigned level; 2458 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2459 CHECK_LINE; 2460 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2461 unsigned val; 2462 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2463 goto no_val; 2464 // validate the input before using level: 2465 if (level > (unsigned)__kmp_xproc) { // level is too big 2466 level = __kmp_xproc; 2467 } 2468 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2469 goto dup_field; 2470 threadInfo[num_avail][nodeIdIndex + level] = val; 2471 continue; 2472 } 2473 2474 // We didn't recognize the leading token on the line. There are lots of 2475 // leading tokens that we don't recognize - if the line isn't empty, go on 2476 // to the next line. 
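// (Typical /proc/cpuinfo lines such as "model name", "cache size" or "flags"
// fall into this category and are silently skipped.)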
2477 if ((*buf != 0) && (*buf != '\n')) { 2478 // If the line is longer than the buffer, read characters 2479 // until we find a newline. 2480 if (long_line) { 2481 int ch; 2482 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2483 ; 2484 } 2485 continue; 2486 } 2487 2488 // A newline has signalled the end of the processor record. 2489 // Check that there aren't too many procs specified. 2490 if ((int)num_avail == __kmp_xproc) { 2491 CLEANUP_THREAD_INFO; 2492 *msg_id = kmp_i18n_str_TooManyEntries; 2493 return false; 2494 } 2495 2496 // Check for missing fields. The osId field must be there, and we 2497 // currently require that the physical id field is specified, also. 2498 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2499 CLEANUP_THREAD_INFO; 2500 *msg_id = kmp_i18n_str_MissingProcField; 2501 return false; 2502 } 2503 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2504 CLEANUP_THREAD_INFO; 2505 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2506 return false; 2507 } 2508 2509 // Skip this proc if it is not included in the machine model. 2510 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2511 __kmp_affin_fullMask)) { 2512 INIT_PROC_INFO(threadInfo[num_avail]); 2513 continue; 2514 } 2515 2516 // We have a successful parse of this proc's info. 2517 // Increment the counter, and prepare for the next proc. 2518 num_avail++; 2519 KMP_ASSERT(num_avail <= num_records); 2520 INIT_PROC_INFO(threadInfo[num_avail]); 2521 } 2522 continue; 2523 2524 no_val: 2525 CLEANUP_THREAD_INFO; 2526 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2527 return false; 2528 2529 dup_field: 2530 CLEANUP_THREAD_INFO; 2531 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2532 return false; 2533 } 2534 *line = 0; 2535 2536 #if KMP_MIC && REDUCE_TEAM_SIZE 2537 unsigned teamSize = 0; 2538 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2539 2540 // check for num_records == __kmp_xproc ??? 2541 2542 // If it is configured to omit the package level when there is only a single 2543 // package, the logic at the end of this routine won't work if there is only a 2544 // single thread 2545 KMP_ASSERT(num_avail > 0); 2546 KMP_ASSERT(num_avail <= num_records); 2547 2548 // Sort the threadInfo table by physical Id. 2549 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2550 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2551 2552 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2553 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2554 // the chips on a system. Although coreId's are usually assigned 2555 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2556 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2557 // 2558 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2559 // total # packages) are at this point - we want to determine that now. We 2560 // only have an upper bound on the first two figures. 2561 unsigned *counts = 2562 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2563 unsigned *maxCt = 2564 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2565 unsigned *totals = 2566 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2567 unsigned *lastId = 2568 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2569 2570 bool assign_thread_ids = false; 2571 unsigned threadIdCt; 2572 unsigned index; 2573 2574 restart_radix_check: 2575 threadIdCt = 0; 2576 2577 // Initialize the counter arrays with data from threadInfo[0]. 
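// For the radix scan below, each array is indexed by field (threadIdIndex up
// to maxIndex) and tracks, roughly:
//   counts[ix] - distinct ids seen at this level within the current parent
//   maxCt[ix]  - the largest counts[ix] value observed so far
//   totals[ix] - total number of distinct objects seen at this level
//   lastId[ix] - id of the most recently processed object at this level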
2578 if (assign_thread_ids) { 2579 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2580 threadInfo[0][threadIdIndex] = threadIdCt++; 2581 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2582 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2583 } 2584 } 2585 for (index = 0; index <= maxIndex; index++) { 2586 counts[index] = 1; 2587 maxCt[index] = 1; 2588 totals[index] = 1; 2589 lastId[index] = threadInfo[0][index]; 2590 ; 2591 } 2592 2593 // Run through the rest of the OS procs. 2594 for (i = 1; i < num_avail; i++) { 2595 // Find the most significant index whose id differs from the id for the 2596 // previous OS proc. 2597 for (index = maxIndex; index >= threadIdIndex; index--) { 2598 if (assign_thread_ids && (index == threadIdIndex)) { 2599 // Auto-assign the thread id field if it wasn't specified. 2600 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2601 threadInfo[i][threadIdIndex] = threadIdCt++; 2602 } 2603 // Apparently the thread id field was specified for some entries and not 2604 // others. Start the thread id counter off at the next higher thread id. 2605 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2606 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2607 } 2608 } 2609 if (threadInfo[i][index] != lastId[index]) { 2610 // Run through all indices which are less significant, and reset the 2611 // counts to 1. At all levels up to and including index, we need to 2612 // increment the totals and record the last id. 2613 unsigned index2; 2614 for (index2 = threadIdIndex; index2 < index; index2++) { 2615 totals[index2]++; 2616 if (counts[index2] > maxCt[index2]) { 2617 maxCt[index2] = counts[index2]; 2618 } 2619 counts[index2] = 1; 2620 lastId[index2] = threadInfo[i][index2]; 2621 } 2622 counts[index]++; 2623 totals[index]++; 2624 lastId[index] = threadInfo[i][index]; 2625 2626 if (assign_thread_ids && (index > threadIdIndex)) { 2627 2628 #if KMP_MIC && REDUCE_TEAM_SIZE 2629 // The default team size is the total #threads in the machine 2630 // minus 1 thread for every core that has 3 or more threads. 2631 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2632 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2633 2634 // Restart the thread counter, as we are on a new core. 2635 threadIdCt = 0; 2636 2637 // Auto-assign the thread id field if it wasn't specified. 2638 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2639 threadInfo[i][threadIdIndex] = threadIdCt++; 2640 } 2641 2642 // Apparently the thread id field was specified for some entries and 2643 // not others. Start the thread id counter off at the next higher 2644 // thread id. 2645 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2646 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2647 } 2648 } 2649 break; 2650 } 2651 } 2652 if (index < threadIdIndex) { 2653 // If thread ids were specified, it is an error if they are not unique. 2654 // Also, check that we waven't already restarted the loop (to be safe - 2655 // shouldn't need to). 2656 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2657 __kmp_free(lastId); 2658 __kmp_free(totals); 2659 __kmp_free(maxCt); 2660 __kmp_free(counts); 2661 CLEANUP_THREAD_INFO; 2662 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2663 return false; 2664 } 2665 2666 // If the thread ids were not specified and we see entries entries that 2667 // are duplicates, start the loop over and assign the thread ids manually. 
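// This typically happens when the cpuinfo source contains no "thread id"
// fields at all: two records that share a physical id and core id both still
// have threadIdIndex == UINT_MAX, which looks like a duplicate, so the scan
// is redone with threadIdCt handing out synthetic thread ids.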
2668 assign_thread_ids = true; 2669 goto restart_radix_check; 2670 } 2671 } 2672 2673 #if KMP_MIC && REDUCE_TEAM_SIZE 2674 // The default team size is the total #threads in the machine 2675 // minus 1 thread for every core that has 3 or more threads. 2676 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2677 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2678 2679 for (index = threadIdIndex; index <= maxIndex; index++) { 2680 if (counts[index] > maxCt[index]) { 2681 maxCt[index] = counts[index]; 2682 } 2683 } 2684 2685 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2686 nCoresPerPkg = maxCt[coreIdIndex]; 2687 nPackages = totals[pkgIdIndex]; 2688 2689 // When affinity is off, this routine will still be called to set 2690 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2691 // Make sure all these vars are set correctly, and return now if affinity is 2692 // not enabled. 2693 __kmp_ncores = totals[coreIdIndex]; 2694 if (!KMP_AFFINITY_CAPABLE()) { 2695 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2696 return true; 2697 } 2698 2699 #if KMP_MIC && REDUCE_TEAM_SIZE 2700 // Set the default team size. 2701 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2702 __kmp_dflt_team_nth = teamSize; 2703 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2704 "__kmp_dflt_team_nth = %d\n", 2705 __kmp_dflt_team_nth)); 2706 } 2707 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2708 2709 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2710 2711 // Count the number of levels which have more nodes at that level than at the 2712 // parent's level (with there being an implicit root node of the top level). 2713 // This is equivalent to saying that there is at least one node at this level 2714 // which has a sibling. These levels are in the map, and the package level is 2715 // always in the map. 2716 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2717 for (index = threadIdIndex; index < maxIndex; index++) { 2718 KMP_ASSERT(totals[index] >= totals[index + 1]); 2719 inMap[index] = (totals[index] > totals[index + 1]); 2720 } 2721 inMap[maxIndex] = (totals[maxIndex] > 1); 2722 inMap[pkgIdIndex] = true; 2723 inMap[coreIdIndex] = true; 2724 inMap[threadIdIndex] = true; 2725 2726 int depth = 0; 2727 int idx = 0; 2728 kmp_hw_t types[KMP_HW_LAST]; 2729 int pkgLevel = -1; 2730 int coreLevel = -1; 2731 int threadLevel = -1; 2732 for (index = threadIdIndex; index <= maxIndex; index++) { 2733 if (inMap[index]) { 2734 depth++; 2735 } 2736 } 2737 if (inMap[pkgIdIndex]) { 2738 pkgLevel = idx; 2739 types[idx++] = KMP_HW_SOCKET; 2740 } 2741 if (inMap[coreIdIndex]) { 2742 coreLevel = idx; 2743 types[idx++] = KMP_HW_CORE; 2744 } 2745 if (inMap[threadIdIndex]) { 2746 threadLevel = idx; 2747 types[idx++] = KMP_HW_THREAD; 2748 } 2749 KMP_ASSERT(depth > 0); 2750 2751 // Construct the data structure that is to be returned. 
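// Illustrative example (hypothetical counts): on a machine with 2 packages x
// 4 cores x 2 threads and no node_<n> fields, totals[threadIdIndex] = 16,
// totals[coreIdIndex] = 8 and totals[pkgIdIndex] = 2, so every level stays in
// the map and the topology is allocated with depth 3 and
// types = { KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD }.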
2752 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2753 2754 for (i = 0; i < num_avail; ++i) { 2755 unsigned os = threadInfo[i][osIdIndex]; 2756 int src_index; 2757 int dst_index = 0; 2758 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2759 hw_thread.clear(); 2760 hw_thread.os_id = os; 2761 2762 idx = 0; 2763 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2764 if (!inMap[src_index]) { 2765 continue; 2766 } 2767 if (src_index == pkgIdIndex) { 2768 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2769 } else if (src_index == coreIdIndex) { 2770 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2771 } else if (src_index == threadIdIndex) { 2772 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2773 } 2774 dst_index++; 2775 } 2776 } 2777 2778 __kmp_free(inMap); 2779 __kmp_free(lastId); 2780 __kmp_free(totals); 2781 __kmp_free(maxCt); 2782 __kmp_free(counts); 2783 CLEANUP_THREAD_INFO; 2784 __kmp_topology->sort_ids(); 2785 if (!__kmp_topology->check_ids()) { 2786 kmp_topology_t::deallocate(__kmp_topology); 2787 __kmp_topology = nullptr; 2788 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2789 return false; 2790 } 2791 return true; 2792 } 2793 2794 // Create and return a table of affinity masks, indexed by OS thread ID. 2795 // This routine handles OR'ing together all the affinity masks of threads 2796 // that are sufficiently close, if granularity > fine. 2797 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2798 unsigned *numUnique) { 2799 // First form a table of affinity masks in order of OS thread id. 2800 int maxOsId; 2801 int i; 2802 int numAddrs = __kmp_topology->get_num_hw_threads(); 2803 int depth = __kmp_topology->get_depth(); 2804 KMP_ASSERT(numAddrs); 2805 KMP_ASSERT(depth); 2806 2807 maxOsId = 0; 2808 for (i = numAddrs - 1;; --i) { 2809 int osId = __kmp_topology->at(i).os_id; 2810 if (osId > maxOsId) { 2811 maxOsId = osId; 2812 } 2813 if (i == 0) 2814 break; 2815 } 2816 kmp_affin_mask_t *osId2Mask; 2817 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2818 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2819 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2820 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2821 } 2822 if (__kmp_affinity_gran_levels >= (int)depth) { 2823 if (__kmp_affinity_verbose || 2824 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2825 KMP_WARNING(AffThreadsMayMigrate); 2826 } 2827 } 2828 2829 // Run through the table, forming the masks for all threads on each core. 2830 // Threads on the same core will have identical kmp_hw_thread_t objects, not 2831 // considering the last level, which must be the thread id. All threads on a 2832 // core will appear consecutively. 2833 int unique = 0; 2834 int j = 0; // index of 1st thread on core 2835 int leader = 0; 2836 kmp_affin_mask_t *sum; 2837 KMP_CPU_ALLOC_ON_STACK(sum); 2838 KMP_CPU_ZERO(sum); 2839 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 2840 for (i = 1; i < numAddrs; i++) { 2841 // If this thread is sufficiently close to the leader (within the 2842 // granularity setting), then set the bit for this os thread in the 2843 // affinity mask for this group, and go on to the next thread. 2844 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 2845 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2846 continue; 2847 } 2848 2849 // For every thread in this group, copy the mask to the thread's entry in 2850 // the osId2Mask table. Mark the first address as a leader. 
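// Example (hypothetical machine): with granularity=core, i.e.
// __kmp_affinity_gran_levels == 1, and 2 hardware threads per core, both OS
// procs of a core test as "close" to the core's leader, so they end up
// sharing a single mask with both of their bits set, and osId2Mask holds one
// distinct mask per core.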
2851 for (; j < i; j++) { 2852 int osId = __kmp_topology->at(j).os_id; 2853 KMP_DEBUG_ASSERT(osId <= maxOsId); 2854 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2855 KMP_CPU_COPY(mask, sum); 2856 __kmp_topology->at(j).leader = (j == leader); 2857 } 2858 unique++; 2859 2860 // Start a new mask. 2861 leader = i; 2862 KMP_CPU_ZERO(sum); 2863 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2864 } 2865 2866 // For every thread in last group, copy the mask to the thread's 2867 // entry in the osId2Mask table. 2868 for (; j < i; j++) { 2869 int osId = __kmp_topology->at(j).os_id; 2870 KMP_DEBUG_ASSERT(osId <= maxOsId); 2871 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2872 KMP_CPU_COPY(mask, sum); 2873 __kmp_topology->at(j).leader = (j == leader); 2874 } 2875 unique++; 2876 KMP_CPU_FREE_FROM_STACK(sum); 2877 2878 *maxIndex = maxOsId; 2879 *numUnique = unique; 2880 return osId2Mask; 2881 } 2882 2883 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2884 // as file-static than to try and pass them through the calling sequence of 2885 // the recursive-descent OMP_PLACES parser. 2886 static kmp_affin_mask_t *newMasks; 2887 static int numNewMasks; 2888 static int nextNewMask; 2889 2890 #define ADD_MASK(_mask) \ 2891 { \ 2892 if (nextNewMask >= numNewMasks) { \ 2893 int i; \ 2894 numNewMasks *= 2; \ 2895 kmp_affin_mask_t *temp; \ 2896 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2897 for (i = 0; i < numNewMasks / 2; i++) { \ 2898 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2899 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2900 KMP_CPU_COPY(dest, src); \ 2901 } \ 2902 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2903 newMasks = temp; \ 2904 } \ 2905 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2906 nextNewMask++; \ 2907 } 2908 2909 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2910 { \ 2911 if (((_osId) > _maxOsId) || \ 2912 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2913 if (__kmp_affinity_verbose || \ 2914 (__kmp_affinity_warnings && \ 2915 (__kmp_affinity_type != affinity_none))) { \ 2916 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2917 } \ 2918 } else { \ 2919 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2920 } \ 2921 } 2922 2923 // Re-parse the proclist (for the explicit affinity type), and form the list 2924 // of affinity newMasks indexed by gtid. 2925 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2926 unsigned int *out_numMasks, 2927 const char *proclist, 2928 kmp_affin_mask_t *osId2Mask, 2929 int maxOsId) { 2930 int i; 2931 const char *scan = proclist; 2932 const char *next = proclist; 2933 2934 // We use malloc() for the temporary mask vector, so that we can use 2935 // realloc() to extend it. 2936 numNewMasks = 2; 2937 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2938 nextNewMask = 0; 2939 kmp_affin_mask_t *sumMask; 2940 KMP_CPU_ALLOC(sumMask); 2941 int setSize = 0; 2942 2943 for (;;) { 2944 int start, end, stride; 2945 2946 SKIP_WS(scan); 2947 next = scan; 2948 if (*next == '\0') { 2949 break; 2950 } 2951 2952 if (*next == '{') { 2953 int num; 2954 setSize = 0; 2955 next++; // skip '{' 2956 SKIP_WS(next); 2957 scan = next; 2958 2959 // Read the first integer in the set. 
2960 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2961 SKIP_DIGITS(next); 2962 num = __kmp_str_to_int(scan, *next); 2963 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2964 2965 // Copy the mask for that osId to the sum (union) mask. 2966 if ((num > maxOsId) || 2967 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2968 if (__kmp_affinity_verbose || 2969 (__kmp_affinity_warnings && 2970 (__kmp_affinity_type != affinity_none))) { 2971 KMP_WARNING(AffIgnoreInvalidProcID, num); 2972 } 2973 KMP_CPU_ZERO(sumMask); 2974 } else { 2975 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2976 setSize = 1; 2977 } 2978 2979 for (;;) { 2980 // Check for end of set. 2981 SKIP_WS(next); 2982 if (*next == '}') { 2983 next++; // skip '}' 2984 break; 2985 } 2986 2987 // Skip optional comma. 2988 if (*next == ',') { 2989 next++; 2990 } 2991 SKIP_WS(next); 2992 2993 // Read the next integer in the set. 2994 scan = next; 2995 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2996 2997 SKIP_DIGITS(next); 2998 num = __kmp_str_to_int(scan, *next); 2999 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 3000 3001 // Add the mask for that osId to the sum mask. 3002 if ((num > maxOsId) || 3003 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3004 if (__kmp_affinity_verbose || 3005 (__kmp_affinity_warnings && 3006 (__kmp_affinity_type != affinity_none))) { 3007 KMP_WARNING(AffIgnoreInvalidProcID, num); 3008 } 3009 } else { 3010 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 3011 setSize++; 3012 } 3013 } 3014 if (setSize > 0) { 3015 ADD_MASK(sumMask); 3016 } 3017 3018 SKIP_WS(next); 3019 if (*next == ',') { 3020 next++; 3021 } 3022 scan = next; 3023 continue; 3024 } 3025 3026 // Read the first integer. 3027 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3028 SKIP_DIGITS(next); 3029 start = __kmp_str_to_int(scan, *next); 3030 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3031 SKIP_WS(next); 3032 3033 // If this isn't a range, then add a mask to the list and go on. 3034 if (*next != '-') { 3035 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3036 3037 // Skip optional comma. 3038 if (*next == ',') { 3039 next++; 3040 } 3041 scan = next; 3042 continue; 3043 } 3044 3045 // This is a range. Skip over the '-' and read in the 2nd int. 3046 next++; // skip '-' 3047 SKIP_WS(next); 3048 scan = next; 3049 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3050 SKIP_DIGITS(next); 3051 end = __kmp_str_to_int(scan, *next); 3052 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3053 3054 // Check for a stride parameter 3055 stride = 1; 3056 SKIP_WS(next); 3057 if (*next == ':') { 3058 // A stride is specified. Skip over the ':" and read the 3rd int. 3059 int sign = +1; 3060 next++; // skip ':' 3061 SKIP_WS(next); 3062 scan = next; 3063 if (*next == '-') { 3064 sign = -1; 3065 next++; 3066 SKIP_WS(next); 3067 scan = next; 3068 } 3069 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3070 SKIP_DIGITS(next); 3071 stride = __kmp_str_to_int(scan, *next); 3072 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 3073 stride *= sign; 3074 } 3075 3076 // Do some range checks. 
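// For example (hypothetical list), "3-9:3" parses as start = 3, end = 9,
// stride = 3 and expands to OS procs 3, 6 and 9, each added as its own mask.
// The checks below reject a zero stride, a range whose direction disagrees
// with the stride's sign, and ranges spanning more than 65536 strides.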
3077 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3078 if (stride > 0) { 3079 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3080 } else { 3081 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3082 } 3083 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3084 3085 // Add the mask for each OS proc # to the list. 3086 if (stride > 0) { 3087 do { 3088 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3089 start += stride; 3090 } while (start <= end); 3091 } else { 3092 do { 3093 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3094 start += stride; 3095 } while (start >= end); 3096 } 3097 3098 // Skip optional comma. 3099 SKIP_WS(next); 3100 if (*next == ',') { 3101 next++; 3102 } 3103 scan = next; 3104 } 3105 3106 *out_numMasks = nextNewMask; 3107 if (nextNewMask == 0) { 3108 *out_masks = NULL; 3109 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3110 return; 3111 } 3112 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3113 for (i = 0; i < nextNewMask; i++) { 3114 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3115 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3116 KMP_CPU_COPY(dest, src); 3117 } 3118 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3119 KMP_CPU_FREE(sumMask); 3120 } 3121 3122 /*----------------------------------------------------------------------------- 3123 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3124 places. Again, Here is the grammar: 3125 3126 place_list := place 3127 place_list := place , place_list 3128 place := num 3129 place := place : num 3130 place := place : num : signed 3131 place := { subplacelist } 3132 place := ! place // (lowest priority) 3133 subplace_list := subplace 3134 subplace_list := subplace , subplace_list 3135 subplace := num 3136 subplace := num : num 3137 subplace := num : num : signed 3138 signed := num 3139 signed := + signed 3140 signed := - signed 3141 -----------------------------------------------------------------------------*/ 3142 static void __kmp_process_subplace_list(const char **scan, 3143 kmp_affin_mask_t *osId2Mask, 3144 int maxOsId, kmp_affin_mask_t *tempMask, 3145 int *setSize) { 3146 const char *next; 3147 3148 for (;;) { 3149 int start, count, stride, i; 3150 3151 // Read in the starting proc id 3152 SKIP_WS(*scan); 3153 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3154 next = *scan; 3155 SKIP_DIGITS(next); 3156 start = __kmp_str_to_int(*scan, *next); 3157 KMP_ASSERT(start >= 0); 3158 *scan = next; 3159 3160 // valid follow sets are ',' ':' and '}' 3161 SKIP_WS(*scan); 3162 if (**scan == '}' || **scan == ',') { 3163 if ((start > maxOsId) || 3164 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3165 if (__kmp_affinity_verbose || 3166 (__kmp_affinity_warnings && 3167 (__kmp_affinity_type != affinity_none))) { 3168 KMP_WARNING(AffIgnoreInvalidProcID, start); 3169 } 3170 } else { 3171 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3172 (*setSize)++; 3173 } 3174 if (**scan == '}') { 3175 break; 3176 } 3177 (*scan)++; // skip ',' 3178 continue; 3179 } 3180 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3181 (*scan)++; // skip ':' 3182 3183 // Read count parameter 3184 SKIP_WS(*scan); 3185 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3186 next = *scan; 3187 SKIP_DIGITS(next); 3188 count = __kmp_str_to_int(*scan, *next); 3189 KMP_ASSERT(count >= 0); 3190 *scan = next; 3191 3192 // valid follow sets are ',' ':' and '}' 3193 SKIP_WS(*scan); 3194 if (**scan == 
'}' || **scan == ',') { 3195 for (i = 0; i < count; i++) { 3196 if ((start > maxOsId) || 3197 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3198 if (__kmp_affinity_verbose || 3199 (__kmp_affinity_warnings && 3200 (__kmp_affinity_type != affinity_none))) { 3201 KMP_WARNING(AffIgnoreInvalidProcID, start); 3202 } 3203 break; // don't proliferate warnings for large count 3204 } else { 3205 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3206 start++; 3207 (*setSize)++; 3208 } 3209 } 3210 if (**scan == '}') { 3211 break; 3212 } 3213 (*scan)++; // skip ',' 3214 continue; 3215 } 3216 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3217 (*scan)++; // skip ':' 3218 3219 // Read stride parameter 3220 int sign = +1; 3221 for (;;) { 3222 SKIP_WS(*scan); 3223 if (**scan == '+') { 3224 (*scan)++; // skip '+' 3225 continue; 3226 } 3227 if (**scan == '-') { 3228 sign *= -1; 3229 (*scan)++; // skip '-' 3230 continue; 3231 } 3232 break; 3233 } 3234 SKIP_WS(*scan); 3235 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3236 next = *scan; 3237 SKIP_DIGITS(next); 3238 stride = __kmp_str_to_int(*scan, *next); 3239 KMP_ASSERT(stride >= 0); 3240 *scan = next; 3241 stride *= sign; 3242 3243 // valid follow sets are ',' and '}' 3244 SKIP_WS(*scan); 3245 if (**scan == '}' || **scan == ',') { 3246 for (i = 0; i < count; i++) { 3247 if ((start > maxOsId) || 3248 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3249 if (__kmp_affinity_verbose || 3250 (__kmp_affinity_warnings && 3251 (__kmp_affinity_type != affinity_none))) { 3252 KMP_WARNING(AffIgnoreInvalidProcID, start); 3253 } 3254 break; // don't proliferate warnings for large count 3255 } else { 3256 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3257 start += stride; 3258 (*setSize)++; 3259 } 3260 } 3261 if (**scan == '}') { 3262 break; 3263 } 3264 (*scan)++; // skip ',' 3265 continue; 3266 } 3267 3268 KMP_ASSERT2(0, "bad explicit places list"); 3269 } 3270 } 3271 3272 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3273 int maxOsId, kmp_affin_mask_t *tempMask, 3274 int *setSize) { 3275 const char *next; 3276 3277 // valid follow sets are '{' '!' and num 3278 SKIP_WS(*scan); 3279 if (**scan == '{') { 3280 (*scan)++; // skip '{' 3281 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3282 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3283 (*scan)++; // skip '}' 3284 } else if (**scan == '!') { 3285 (*scan)++; // skip '!' 
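// A '!' place excludes its operand: the operand is parsed into tempMask
// first and then complemented over OS ids 0..maxOsId, so e.g. a hypothetical
// "!{0,1}" selects every known OS proc except 0 and 1.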
3286 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3287 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3288 } else if ((**scan >= '0') && (**scan <= '9')) { 3289 next = *scan; 3290 SKIP_DIGITS(next); 3291 int num = __kmp_str_to_int(*scan, *next); 3292 KMP_ASSERT(num >= 0); 3293 if ((num > maxOsId) || 3294 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3295 if (__kmp_affinity_verbose || 3296 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3297 KMP_WARNING(AffIgnoreInvalidProcID, num); 3298 } 3299 } else { 3300 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3301 (*setSize)++; 3302 } 3303 *scan = next; // skip num 3304 } else { 3305 KMP_ASSERT2(0, "bad explicit places list"); 3306 } 3307 } 3308 3309 // static void 3310 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3311 unsigned int *out_numMasks, 3312 const char *placelist, 3313 kmp_affin_mask_t *osId2Mask, 3314 int maxOsId) { 3315 int i, j, count, stride, sign; 3316 const char *scan = placelist; 3317 const char *next = placelist; 3318 3319 numNewMasks = 2; 3320 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3321 nextNewMask = 0; 3322 3323 // tempMask is modified based on the previous or initial 3324 // place to form the current place 3325 // previousMask contains the previous place 3326 kmp_affin_mask_t *tempMask; 3327 kmp_affin_mask_t *previousMask; 3328 KMP_CPU_ALLOC(tempMask); 3329 KMP_CPU_ZERO(tempMask); 3330 KMP_CPU_ALLOC(previousMask); 3331 KMP_CPU_ZERO(previousMask); 3332 int setSize = 0; 3333 3334 for (;;) { 3335 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3336 3337 // valid follow sets are ',' ':' and EOL 3338 SKIP_WS(scan); 3339 if (*scan == '\0' || *scan == ',') { 3340 if (setSize > 0) { 3341 ADD_MASK(tempMask); 3342 } 3343 KMP_CPU_ZERO(tempMask); 3344 setSize = 0; 3345 if (*scan == '\0') { 3346 break; 3347 } 3348 scan++; // skip ',' 3349 continue; 3350 } 3351 3352 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3353 scan++; // skip ':' 3354 3355 // Read count parameter 3356 SKIP_WS(scan); 3357 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3358 next = scan; 3359 SKIP_DIGITS(next); 3360 count = __kmp_str_to_int(scan, *next); 3361 KMP_ASSERT(count >= 0); 3362 scan = next; 3363 3364 // valid follow sets are ',' ':' and EOL 3365 SKIP_WS(scan); 3366 if (*scan == '\0' || *scan == ',') { 3367 stride = +1; 3368 } else { 3369 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3370 scan++; // skip ':' 3371 3372 // Read stride parameter 3373 sign = +1; 3374 for (;;) { 3375 SKIP_WS(scan); 3376 if (*scan == '+') { 3377 scan++; // skip '+' 3378 continue; 3379 } 3380 if (*scan == '-') { 3381 sign *= -1; 3382 scan++; // skip '-' 3383 continue; 3384 } 3385 break; 3386 } 3387 SKIP_WS(scan); 3388 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3389 next = scan; 3390 SKIP_DIGITS(next); 3391 stride = __kmp_str_to_int(scan, *next); 3392 KMP_DEBUG_ASSERT(stride >= 0); 3393 scan = next; 3394 stride *= sign; 3395 } 3396 3397 // Add places determined by initial_place : count : stride 3398 for (i = 0; i < count; i++) { 3399 if (setSize == 0) { 3400 break; 3401 } 3402 // Add the current place, then build the next place (tempMask) from that 3403 KMP_CPU_COPY(previousMask, tempMask); 3404 ADD_MASK(previousMask); 3405 KMP_CPU_ZERO(tempMask); 3406 setSize = 0; 3407 KMP_CPU_SET_ITERATE(j, previousMask) { 3408 if (!KMP_CPU_ISSET(j, previousMask)) { 3409 continue; 3410 } 3411 if ((j + stride > 
maxOsId) || (j + stride < 0) || 3412 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3413 (!KMP_CPU_ISSET(j + stride, 3414 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3415 if ((__kmp_affinity_verbose || 3416 (__kmp_affinity_warnings && 3417 (__kmp_affinity_type != affinity_none))) && 3418 i < count - 1) { 3419 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3420 } 3421 continue; 3422 } 3423 KMP_CPU_SET(j + stride, tempMask); 3424 setSize++; 3425 } 3426 } 3427 KMP_CPU_ZERO(tempMask); 3428 setSize = 0; 3429 3430 // valid follow sets are ',' and EOL 3431 SKIP_WS(scan); 3432 if (*scan == '\0') { 3433 break; 3434 } 3435 if (*scan == ',') { 3436 scan++; // skip ',' 3437 continue; 3438 } 3439 3440 KMP_ASSERT2(0, "bad explicit places list"); 3441 } 3442 3443 *out_numMasks = nextNewMask; 3444 if (nextNewMask == 0) { 3445 *out_masks = NULL; 3446 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3447 return; 3448 } 3449 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3450 KMP_CPU_FREE(tempMask); 3451 KMP_CPU_FREE(previousMask); 3452 for (i = 0; i < nextNewMask; i++) { 3453 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3454 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3455 KMP_CPU_COPY(dest, src); 3456 } 3457 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3458 } 3459 3460 #undef ADD_MASK 3461 #undef ADD_MASK_OSID 3462 3463 // This function figures out the deepest level at which there is at least one 3464 // cluster/core with more than one processing unit bound to it. 3465 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3466 int core_level = 0; 3467 3468 for (int i = 0; i < nprocs; i++) { 3469 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3470 for (int j = bottom_level; j > 0; j--) { 3471 if (hw_thread.ids[j] > 0) { 3472 if (core_level < (j - 1)) { 3473 core_level = j - 1; 3474 } 3475 } 3476 } 3477 } 3478 return core_level; 3479 } 3480 3481 // This function counts number of clusters/cores at given level. 3482 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3483 int core_level) { 3484 return __kmp_topology->get_count(core_level); 3485 } 3486 // This function finds to which cluster/core given processing unit is bound. 3487 static int __kmp_affinity_find_core(int proc, int bottom_level, 3488 int core_level) { 3489 int core = 0; 3490 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3491 for (int i = 0; i <= proc; ++i) { 3492 if (i + 1 <= proc) { 3493 for (int j = 0; j <= core_level; ++j) { 3494 if (__kmp_topology->at(i + 1).sub_ids[j] != 3495 __kmp_topology->at(i).sub_ids[j]) { 3496 core++; 3497 break; 3498 } 3499 } 3500 } 3501 } 3502 return core; 3503 } 3504 3505 // This function finds maximal number of processing units bound to a 3506 // cluster/core at given level. 
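// When core_level is already the bottom level, there can be at most one
// processing unit per core, so the function returns 1; otherwise it returns
// the ratio of KMP_HW_THREAD objects to objects at core_level in the
// canonicalized topology.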
3507 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, 3508 int core_level) { 3509 if (core_level >= bottom_level) 3510 return 1; 3511 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); 3512 return __kmp_topology->calculate_ratio(thread_level, core_level); 3513 } 3514 3515 static int *procarr = NULL; 3516 static int __kmp_aff_depth = 0; 3517 3518 // Create a one element mask array (set of places) which only contains the 3519 // initial process's affinity mask 3520 static void __kmp_create_affinity_none_places() { 3521 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3522 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3523 __kmp_affinity_num_masks = 1; 3524 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3525 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 3526 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 3527 } 3528 3529 static void __kmp_aux_affinity_initialize(void) { 3530 if (__kmp_affinity_masks != NULL) { 3531 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3532 return; 3533 } 3534 3535 // Create the "full" mask - this defines all of the processors that we 3536 // consider to be in the machine model. If respect is set, then it is the 3537 // initialization thread's affinity mask. Otherwise, it is all processors that 3538 // we know about on the machine. 3539 if (__kmp_affin_fullMask == NULL) { 3540 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3541 } 3542 if (KMP_AFFINITY_CAPABLE()) { 3543 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3544 if (__kmp_affinity_respect_mask) { 3545 // Count the number of available processors. 3546 unsigned i; 3547 __kmp_avail_proc = 0; 3548 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3549 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3550 continue; 3551 } 3552 __kmp_avail_proc++; 3553 } 3554 if (__kmp_avail_proc > __kmp_xproc) { 3555 if (__kmp_affinity_verbose || 3556 (__kmp_affinity_warnings && 3557 (__kmp_affinity_type != affinity_none))) { 3558 KMP_WARNING(ErrorInitializeAffinity); 3559 } 3560 __kmp_affinity_type = affinity_none; 3561 KMP_AFFINITY_DISABLE(); 3562 return; 3563 } 3564 3565 if (__kmp_affinity_verbose) { 3566 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3567 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3568 __kmp_affin_fullMask); 3569 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 3570 } 3571 } else { 3572 if (__kmp_affinity_verbose) { 3573 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3574 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3575 __kmp_affin_fullMask); 3576 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 3577 } 3578 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3579 __kmp_avail_proc = __kmp_xproc; 3580 #if KMP_OS_WINDOWS 3581 // Set the process affinity mask since threads' affinity 3582 // masks must be subset of process mask in Windows* OS 3583 __kmp_affin_fullMask->set_process_affinity(true); 3584 #endif 3585 } 3586 } 3587 3588 kmp_i18n_id_t msg_id = kmp_i18n_null; 3589 3590 // For backward compatibility, setting KMP_CPUINFO_FILE => 3591 // KMP_TOPOLOGY_METHOD=cpuinfo 3592 if ((__kmp_cpuinfo_file != NULL) && 3593 (__kmp_affinity_top_method == affinity_top_method_all)) { 3594 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3595 } 3596 3597 bool success = false; 3598 if (__kmp_affinity_top_method == affinity_top_method_all) { 3599 // In the default code path, errors are not fatal - we just try using 3600 // another method. 
We only emit a warning message if affinity is on, or the 3601 // verbose flag is set, and the nowarnings flag was not set. 3602 #if KMP_USE_HWLOC 3603 if (!success && 3604 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3605 if (!__kmp_hwloc_error) { 3606 success = __kmp_affinity_create_hwloc_map(&msg_id); 3607 if (!success && __kmp_affinity_verbose) { 3608 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3609 } 3610 } else if (__kmp_affinity_verbose) { 3611 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3612 } 3613 } 3614 #endif 3615 3616 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3617 if (!success) { 3618 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3619 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3620 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3621 } 3622 } 3623 if (!success) { 3624 success = __kmp_affinity_create_apicid_map(&msg_id); 3625 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3626 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3627 } 3628 } 3629 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3630 3631 #if KMP_OS_LINUX 3632 if (!success) { 3633 int line = 0; 3634 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3635 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3636 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3637 } 3638 } 3639 #endif /* KMP_OS_LINUX */ 3640 3641 #if KMP_GROUP_AFFINITY 3642 if (!success && (__kmp_num_proc_groups > 1)) { 3643 success = __kmp_affinity_create_proc_group_map(&msg_id); 3644 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3645 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3646 } 3647 } 3648 #endif /* KMP_GROUP_AFFINITY */ 3649 3650 if (!success) { 3651 success = __kmp_affinity_create_flat_map(&msg_id); 3652 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3653 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3654 } 3655 KMP_ASSERT(success); 3656 } 3657 } 3658 3659 // If the user has specified that a particular topology discovery method is to be 3660 // used, then we abort if that method fails. The exception is group affinity, 3661 // which might have been implicitly set.
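  // The explicit settings handled below are hwloc, x2apicid / x2apicid_1f,
  // apicid, cpuinfo, group, and flat; each branch aborts if its method fails,
  // while the flat method is expected never to fail.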
3662 #if KMP_USE_HWLOC 3663 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3664 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 3665 success = __kmp_affinity_create_hwloc_map(&msg_id); 3666 if (!success) { 3667 KMP_ASSERT(msg_id != kmp_i18n_null); 3668 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3669 } 3670 } 3671 #endif // KMP_USE_HWLOC 3672 3673 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3674 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 3675 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 3676 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3677 if (!success) { 3678 KMP_ASSERT(msg_id != kmp_i18n_null); 3679 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3680 } 3681 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3682 success = __kmp_affinity_create_apicid_map(&msg_id); 3683 if (!success) { 3684 KMP_ASSERT(msg_id != kmp_i18n_null); 3685 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3686 } 3687 } 3688 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3689 3690 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3691 int line = 0; 3692 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3693 if (!success) { 3694 KMP_ASSERT(msg_id != kmp_i18n_null); 3695 const char *filename = __kmp_cpuinfo_get_filename(); 3696 if (line > 0) { 3697 KMP_FATAL(FileLineMsgExiting, filename, line, 3698 __kmp_i18n_catgets(msg_id)); 3699 } else { 3700 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3701 } 3702 } 3703 } 3704 3705 #if KMP_GROUP_AFFINITY 3706 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3707 success = __kmp_affinity_create_proc_group_map(&msg_id); 3708 KMP_ASSERT(success); 3709 if (!success) { 3710 KMP_ASSERT(msg_id != kmp_i18n_null); 3711 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3712 } 3713 } 3714 #endif /* KMP_GROUP_AFFINITY */ 3715 3716 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3717 success = __kmp_affinity_create_flat_map(&msg_id); 3718 // should not fail 3719 KMP_ASSERT(success); 3720 } 3721 3722 // Early exit if topology could not be created 3723 if (!__kmp_topology) { 3724 if (KMP_AFFINITY_CAPABLE() && 3725 (__kmp_affinity_verbose || 3726 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 3727 KMP_WARNING(ErrorInitializeAffinity); 3728 } 3729 if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && 3730 __kmp_ncores > 0) { 3731 __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); 3732 __kmp_topology->canonicalize(nPackages, nCoresPerPkg, 3733 __kmp_nThreadsPerCore, __kmp_ncores); 3734 if (__kmp_affinity_verbose) { 3735 __kmp_topology->print("KMP_AFFINITY"); 3736 } 3737 } 3738 __kmp_affinity_type = affinity_none; 3739 __kmp_create_affinity_none_places(); 3740 #if KMP_USE_HIER_SCHED 3741 __kmp_dispatch_set_hierarchy_values(); 3742 #endif 3743 KMP_AFFINITY_DISABLE(); 3744 return; 3745 } 3746 3747 // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and 3748 // initialize other data structures which depend on the topology 3749 __kmp_topology->canonicalize(); 3750 if (__kmp_affinity_verbose) 3751 __kmp_topology->print("KMP_AFFINITY"); 3752 bool filtered = __kmp_topology->filter_hw_subset(); 3753 if (filtered && __kmp_affinity_verbose) 3754 __kmp_topology->print("KMP_HW_SUBSET"); 3755 machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); 3756 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); 3757 // If 
KMP_AFFINITY=none, then only create the single "none" place 3758 // which is the process's initial affinity mask or the number of 3759 // hardware threads depending on respect,norespect 3760 if (__kmp_affinity_type == affinity_none) { 3761 __kmp_create_affinity_none_places(); 3762 #if KMP_USE_HIER_SCHED 3763 __kmp_dispatch_set_hierarchy_values(); 3764 #endif 3765 return; 3766 } 3767 int depth = __kmp_topology->get_depth(); 3768 3769 // Create the table of masks, indexed by thread Id. 3770 unsigned maxIndex; 3771 unsigned numUnique; 3772 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); 3773 if (__kmp_affinity_gran_levels == 0) { 3774 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3775 } 3776 3777 switch (__kmp_affinity_type) { 3778 3779 case affinity_explicit: 3780 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3781 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 3782 __kmp_affinity_process_proclist( 3783 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3784 __kmp_affinity_proclist, osId2Mask, maxIndex); 3785 } else { 3786 __kmp_affinity_process_placelist( 3787 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3788 __kmp_affinity_proclist, osId2Mask, maxIndex); 3789 } 3790 if (__kmp_affinity_num_masks == 0) { 3791 if (__kmp_affinity_verbose || 3792 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3793 KMP_WARNING(AffNoValidProcID); 3794 } 3795 __kmp_affinity_type = affinity_none; 3796 __kmp_create_affinity_none_places(); 3797 return; 3798 } 3799 break; 3800 3801 // The other affinity types rely on sorting the hardware threads according to 3802 // some permutation of the machine topology tree. Set __kmp_affinity_compact 3803 // and __kmp_affinity_offset appropriately, then jump to a common code 3804 // fragment to do the sort and create the array of affinity masks. 
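  // Roughly speaking: __kmp_affinity_compact == 0 keeps the table in its
  // natural outermost-first order, so consecutive places stay topologically
  // close together (compact), while values near depth - 1 make the innermost
  // ids the primary sort keys, so consecutive places are spread across
  // packages first (scatter). __kmp_affinity_offset then selects which place
  // the first thread is bound to.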
3805 case affinity_logical: 3806 __kmp_affinity_compact = 0; 3807 if (__kmp_affinity_offset) { 3808 __kmp_affinity_offset = 3809 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 3810 } 3811 goto sortTopology; 3812 3813 case affinity_physical: 3814 if (__kmp_nThreadsPerCore > 1) { 3815 __kmp_affinity_compact = 1; 3816 if (__kmp_affinity_compact >= depth) { 3817 __kmp_affinity_compact = 0; 3818 } 3819 } else { 3820 __kmp_affinity_compact = 0; 3821 } 3822 if (__kmp_affinity_offset) { 3823 __kmp_affinity_offset = 3824 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 3825 } 3826 goto sortTopology; 3827 3828 case affinity_scatter: 3829 if (__kmp_affinity_compact >= depth) { 3830 __kmp_affinity_compact = 0; 3831 } else { 3832 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3833 } 3834 goto sortTopology; 3835 3836 case affinity_compact: 3837 if (__kmp_affinity_compact >= depth) { 3838 __kmp_affinity_compact = depth - 1; 3839 } 3840 goto sortTopology; 3841 3842 case affinity_balanced: 3843 if (depth <= 1) { 3844 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 3845 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 3846 } 3847 __kmp_affinity_type = affinity_none; 3848 __kmp_create_affinity_none_places(); 3849 return; 3850 } else if (!__kmp_topology->is_uniform()) { 3851 // Save the depth for further usage 3852 __kmp_aff_depth = depth; 3853 3854 int core_level = 3855 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); 3856 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, 3857 core_level); 3858 int maxprocpercore = __kmp_affinity_max_proc_per_core( 3859 __kmp_avail_proc, depth - 1, core_level); 3860 3861 int nproc = ncores * maxprocpercore; 3862 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 3863 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 3864 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 3865 } 3866 __kmp_affinity_type = affinity_none; 3867 return; 3868 } 3869 3870 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 3871 for (int i = 0; i < nproc; i++) { 3872 procarr[i] = -1; 3873 } 3874 3875 int lastcore = -1; 3876 int inlastcore = 0; 3877 for (int i = 0; i < __kmp_avail_proc; i++) { 3878 int proc = __kmp_topology->at(i).os_id; 3879 int core = __kmp_affinity_find_core(i, depth - 1, core_level); 3880 3881 if (core == lastcore) { 3882 inlastcore++; 3883 } else { 3884 inlastcore = 0; 3885 } 3886 lastcore = core; 3887 3888 procarr[core * maxprocpercore + inlastcore] = proc; 3889 } 3890 } 3891 if (__kmp_affinity_compact >= depth) { 3892 __kmp_affinity_compact = depth - 1; 3893 } 3894 3895 sortTopology: 3896 // Allocate the gtid->affinity mask table. 3897 if (__kmp_affinity_dups) { 3898 __kmp_affinity_num_masks = __kmp_avail_proc; 3899 } else { 3900 __kmp_affinity_num_masks = numUnique; 3901 } 3902 3903 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 3904 (__kmp_affinity_num_places > 0) && 3905 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 3906 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3907 } 3908 3909 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3910 3911 // Sort the topology table according to the current setting of 3912 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
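  // One mask is emitted per place: every hardware thread when
  // __kmp_affinity_dups is set, otherwise only the leader thread of each
  // unique mask, stopping once __kmp_affinity_num_masks entries are filled.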
3913 __kmp_topology->sort_compact(); 3914 { 3915 int i; 3916 unsigned j; 3917 int num_hw_threads = __kmp_topology->get_num_hw_threads(); 3918 for (i = 0, j = 0; i < num_hw_threads; i++) { 3919 if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { 3920 continue; 3921 } 3922 int osId = __kmp_topology->at(i).os_id; 3923 3924 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3925 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3926 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3927 KMP_CPU_COPY(dest, src); 3928 if (++j >= __kmp_affinity_num_masks) { 3929 break; 3930 } 3931 } 3932 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3933 } 3934 // Sort the topology back using ids 3935 __kmp_topology->sort_ids(); 3936 break; 3937 3938 default: 3939 KMP_ASSERT2(0, "Unexpected affinity setting"); 3940 } 3941 3942 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 3943 } 3944 3945 void __kmp_affinity_initialize(void) { 3946 // Much of the code above was written assuming that if a machine was not 3947 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3948 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3949 // There are too many checks for __kmp_affinity_type == affinity_none 3950 // in this code. Instead of trying to change them all, check if 3951 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3952 // affinity_none, call the real initialization routine, then restore 3953 // __kmp_affinity_type to affinity_disabled. 3954 int disabled = (__kmp_affinity_type == affinity_disabled); 3955 if (!KMP_AFFINITY_CAPABLE()) { 3956 KMP_ASSERT(disabled); 3957 } 3958 if (disabled) { 3959 __kmp_affinity_type = affinity_none; 3960 } 3961 __kmp_aux_affinity_initialize(); 3962 if (disabled) { 3963 __kmp_affinity_type = affinity_disabled; 3964 } 3965 } 3966 3967 void __kmp_affinity_uninitialize(void) { 3968 if (__kmp_affinity_masks != NULL) { 3969 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3970 __kmp_affinity_masks = NULL; 3971 } 3972 if (__kmp_affin_fullMask != NULL) { 3973 KMP_CPU_FREE(__kmp_affin_fullMask); 3974 __kmp_affin_fullMask = NULL; 3975 } 3976 __kmp_affinity_num_masks = 0; 3977 __kmp_affinity_type = affinity_default; 3978 __kmp_affinity_num_places = 0; 3979 if (__kmp_affinity_proclist != NULL) { 3980 __kmp_free(__kmp_affinity_proclist); 3981 __kmp_affinity_proclist = NULL; 3982 } 3983 if (procarr != NULL) { 3984 __kmp_free(procarr); 3985 procarr = NULL; 3986 } 3987 #if KMP_USE_HWLOC 3988 if (__kmp_hwloc_topology != NULL) { 3989 hwloc_topology_destroy(__kmp_hwloc_topology); 3990 __kmp_hwloc_topology = NULL; 3991 } 3992 #endif 3993 if (__kmp_hw_subset) { 3994 kmp_hw_subset_t::deallocate(__kmp_hw_subset); 3995 __kmp_hw_subset = nullptr; 3996 } 3997 if (__kmp_topology) { 3998 kmp_topology_t::deallocate(__kmp_topology); 3999 __kmp_topology = nullptr; 4000 } 4001 KMPAffinity::destroy_api(); 4002 } 4003 4004 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 4005 if (!KMP_AFFINITY_CAPABLE()) { 4006 return; 4007 } 4008 4009 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4010 if (th->th.th_affin_mask == NULL) { 4011 KMP_CPU_ALLOC(th->th.th_affin_mask); 4012 } else { 4013 KMP_CPU_ZERO(th->th.th_affin_mask); 4014 } 4015 4016 // Copy the thread mask to the kmp_info_t structure. If 4017 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. 
one that 4018 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 4019 // then the full mask is the same as the mask of the initialization thread. 4020 kmp_affin_mask_t *mask; 4021 int i; 4022 4023 if (KMP_AFFINITY_NON_PROC_BIND) { 4024 if ((__kmp_affinity_type == affinity_none) || 4025 (__kmp_affinity_type == affinity_balanced) || 4026 KMP_HIDDEN_HELPER_THREAD(gtid)) { 4027 #if KMP_GROUP_AFFINITY 4028 if (__kmp_num_proc_groups > 1) { 4029 return; 4030 } 4031 #endif 4032 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4033 i = 0; 4034 mask = __kmp_affin_fullMask; 4035 } else { 4036 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4037 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4038 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4039 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4040 } 4041 } else { 4042 if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) || 4043 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 4044 #if KMP_GROUP_AFFINITY 4045 if (__kmp_num_proc_groups > 1) { 4046 return; 4047 } 4048 #endif 4049 KMP_ASSERT(__kmp_affin_fullMask != NULL); 4050 i = KMP_PLACE_ALL; 4051 mask = __kmp_affin_fullMask; 4052 } else { 4053 // int i = some hash function or just a counter that doesn't 4054 // always start at 0. Use adjusted gtid for now. 4055 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 4056 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 4057 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 4058 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 4059 } 4060 } 4061 4062 th->th.th_current_place = i; 4063 if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) { 4064 th->th.th_new_place = i; 4065 th->th.th_first_place = 0; 4066 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4067 } else if (KMP_AFFINITY_NON_PROC_BIND) { 4068 // When using a Non-OMP_PROC_BIND affinity method, 4069 // set all threads' place-partition-var to the entire place list 4070 th->th.th_first_place = 0; 4071 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4072 } 4073 4074 if (i == KMP_PLACE_ALL) { 4075 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4076 gtid)); 4077 } else { 4078 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4079 gtid, i)); 4080 } 4081 4082 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4083 4084 if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid) 4085 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4086 && (__kmp_affinity_type == affinity_none || 4087 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4088 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4089 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4090 th->th.th_affin_mask); 4091 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4092 __kmp_gettid(), gtid, buf); 4093 } 4094 4095 #if KMP_DEBUG 4096 // Hidden helper thread affinity only printed for debug builds 4097 if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) { 4098 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4099 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4100 th->th.th_affin_mask); 4101 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)", 4102 (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); 4103 } 4104 #endif 4105 4106 #if KMP_OS_WINDOWS 4107 // On Windows* OS, the process affinity mask might have changed. If the user 4108 // didn't request affinity and this call fails, just continue silently. 4109 // See CQ171393. 
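  // (The second argument to __kmp_set_system_affinity selects whether a
  // failure is treated as fatal: the affinity_none path below passes FALSE so
  // the error is ignored; the default path passes TRUE.)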
4110 if (__kmp_affinity_type == affinity_none) { 4111 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4112 } else 4113 #endif 4114 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4115 } 4116 4117 void __kmp_affinity_set_place(int gtid) { 4118 if (!KMP_AFFINITY_CAPABLE()) { 4119 return; 4120 } 4121 4122 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4123 4124 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4125 "place = %d)\n", 4126 gtid, th->th.th_new_place, th->th.th_current_place)); 4127 4128 // Check that the new place is within this thread's partition. 4129 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4130 KMP_ASSERT(th->th.th_new_place >= 0); 4131 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4132 if (th->th.th_first_place <= th->th.th_last_place) { 4133 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 4134 (th->th.th_new_place <= th->th.th_last_place)); 4135 } else { 4136 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 4137 (th->th.th_new_place >= th->th.th_last_place)); 4138 } 4139 4140 // Copy the thread mask to the kmp_info_t structure, 4141 // and set this thread's affinity. 4142 kmp_affin_mask_t *mask = 4143 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); 4144 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4145 th->th.th_current_place = th->th.th_new_place; 4146 4147 if (__kmp_affinity_verbose) { 4148 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4149 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4150 th->th.th_affin_mask); 4151 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4152 __kmp_gettid(), gtid, buf); 4153 } 4154 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4155 } 4156 4157 int __kmp_aux_set_affinity(void **mask) { 4158 int gtid; 4159 kmp_info_t *th; 4160 int retval; 4161 4162 if (!KMP_AFFINITY_CAPABLE()) { 4163 return -1; 4164 } 4165 4166 gtid = __kmp_entry_gtid(); 4167 KA_TRACE( 4168 1000, (""); { 4169 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4170 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4171 (kmp_affin_mask_t *)(*mask)); 4172 __kmp_debug_printf( 4173 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4174 gtid, buf); 4175 }); 4176 4177 if (__kmp_env_consistency_check) { 4178 if ((mask == NULL) || (*mask == NULL)) { 4179 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4180 } else { 4181 unsigned proc; 4182 int num_procs = 0; 4183 4184 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 4185 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4186 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4187 } 4188 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4189 continue; 4190 } 4191 num_procs++; 4192 } 4193 if (num_procs == 0) { 4194 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4195 } 4196 4197 #if KMP_GROUP_AFFINITY 4198 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 4199 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4200 } 4201 #endif /* KMP_GROUP_AFFINITY */ 4202 } 4203 } 4204 4205 th = __kmp_threads[gtid]; 4206 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4207 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4208 if (retval == 0) { 4209 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 4210 } 4211 4212 th->th.th_current_place = KMP_PLACE_UNDEFINED; 4213 th->th.th_new_place = KMP_PLACE_UNDEFINED; 4214 th->th.th_first_place = 0; 4215 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4216 4217 // Turn off 4.0 
affinity for the current thread at this parallel level. 4218 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4219 4220 return retval; 4221 } 4222 4223 int __kmp_aux_get_affinity(void **mask) { 4224 int gtid; 4225 int retval; 4226 #if KMP_OS_WINDOWS || KMP_DEBUG 4227 kmp_info_t *th; 4228 #endif 4229 if (!KMP_AFFINITY_CAPABLE()) { 4230 return -1; 4231 } 4232 4233 gtid = __kmp_entry_gtid(); 4234 #if KMP_OS_WINDOWS || KMP_DEBUG 4235 th = __kmp_threads[gtid]; 4236 #else 4237 (void)gtid; // unused variable 4238 #endif 4239 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4240 4241 KA_TRACE( 4242 1000, (""); { 4243 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4244 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4245 th->th.th_affin_mask); 4246 __kmp_printf( 4247 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 4248 buf); 4249 }); 4250 4251 if (__kmp_env_consistency_check) { 4252 if ((mask == NULL) || (*mask == NULL)) { 4253 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4254 } 4255 } 4256 4257 #if !KMP_OS_WINDOWS 4258 4259 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4260 KA_TRACE( 4261 1000, (""); { 4262 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4263 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4264 (kmp_affin_mask_t *)(*mask)); 4265 __kmp_printf( 4266 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 4267 buf); 4268 }); 4269 return retval; 4270 4271 #else 4272 (void)retval; 4273 4274 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4275 return 0; 4276 4277 #endif /* KMP_OS_WINDOWS */ 4278 } 4279 4280 int __kmp_aux_get_affinity_max_proc() { 4281 if (!KMP_AFFINITY_CAPABLE()) { 4282 return 0; 4283 } 4284 #if KMP_GROUP_AFFINITY 4285 if (__kmp_num_proc_groups > 1) { 4286 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4287 } 4288 #endif 4289 return __kmp_xproc; 4290 } 4291 4292 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4293 if (!KMP_AFFINITY_CAPABLE()) { 4294 return -1; 4295 } 4296 4297 KA_TRACE( 4298 1000, (""); { 4299 int gtid = __kmp_entry_gtid(); 4300 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4301 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4302 (kmp_affin_mask_t *)(*mask)); 4303 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4304 "affinity mask for thread %d = %s\n", 4305 proc, gtid, buf); 4306 }); 4307 4308 if (__kmp_env_consistency_check) { 4309 if ((mask == NULL) || (*mask == NULL)) { 4310 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4311 } 4312 } 4313 4314 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4315 return -1; 4316 } 4317 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4318 return -2; 4319 } 4320 4321 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4322 return 0; 4323 } 4324 4325 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4326 if (!KMP_AFFINITY_CAPABLE()) { 4327 return -1; 4328 } 4329 4330 KA_TRACE( 4331 1000, (""); { 4332 int gtid = __kmp_entry_gtid(); 4333 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4334 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4335 (kmp_affin_mask_t *)(*mask)); 4336 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4337 "affinity mask for thread %d = %s\n", 4338 proc, gtid, buf); 4339 }); 4340 4341 if (__kmp_env_consistency_check) { 4342 if ((mask == NULL) || (*mask == NULL)) { 4343 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4344 } 4345 } 4346 4347 if ((proc < 0) || (proc >=
__kmp_aux_get_affinity_max_proc())) { 4348 return -1; 4349 } 4350 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4351 return -2; 4352 } 4353 4354 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4355 return 0; 4356 } 4357 4358 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4359 if (!KMP_AFFINITY_CAPABLE()) { 4360 return -1; 4361 } 4362 4363 KA_TRACE( 4364 1000, (""); { 4365 int gtid = __kmp_entry_gtid(); 4366 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4367 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4368 (kmp_affin_mask_t *)(*mask)); 4369 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4370 "affinity mask for thread %d = %s\n", 4371 proc, gtid, buf); 4372 }); 4373 4374 if (__kmp_env_consistency_check) { 4375 if ((mask == NULL) || (*mask == NULL)) { 4376 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4377 } 4378 } 4379 4380 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4381 return -1; 4382 } 4383 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4384 return 0; 4385 } 4386 4387 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4388 } 4389 4390 // Dynamic affinity settings - Affinity balanced 4391 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 4392 KMP_DEBUG_ASSERT(th); 4393 bool fine_gran = true; 4394 int tid = th->th.th_info.ds.ds_tid; 4395 4396 // Do not perform balanced affinity for the hidden helper threads 4397 if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) 4398 return; 4399 4400 switch (__kmp_affinity_gran) { 4401 case KMP_HW_THREAD: 4402 break; 4403 case KMP_HW_CORE: 4404 if (__kmp_nThreadsPerCore > 1) { 4405 fine_gran = false; 4406 } 4407 break; 4408 case KMP_HW_SOCKET: 4409 if (nCoresPerPkg > 1) { 4410 fine_gran = false; 4411 } 4412 break; 4413 default: 4414 fine_gran = false; 4415 } 4416 4417 if (__kmp_topology->is_uniform()) { 4418 int coreID; 4419 int threadID; 4420 // Number of hyper threads per core in HT machine 4421 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4422 // Number of cores 4423 int ncores = __kmp_ncores; 4424 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4425 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4426 ncores = nPackages; 4427 } 4428 // How many threads will be bound to each core 4429 int chunk = nthreads / ncores; 4430 // How many cores will have an additional thread bound to it - "big cores" 4431 int big_cores = nthreads % ncores; 4432 // Number of threads on the big cores 4433 int big_nth = (chunk + 1) * big_cores; 4434 if (tid < big_nth) { 4435 coreID = tid / (chunk + 1); 4436 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4437 } else { // tid >= big_nth 4438 coreID = (tid - big_cores) / chunk; 4439 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4440 } 4441 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4442 "Illegal set affinity operation when not capable"); 4443 4444 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4445 KMP_CPU_ZERO(mask); 4446 4447 if (fine_gran) { 4448 int osID = 4449 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; 4450 KMP_CPU_SET(osID, mask); 4451 } else { 4452 for (int i = 0; i < __kmp_nth_per_core; i++) { 4453 int osID; 4454 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; 4455 KMP_CPU_SET(osID, mask); 4456 } 4457 } 4458 if (__kmp_affinity_verbose) { 4459 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4460 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4461 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4462 __kmp_gettid(), 
tid, buf); 4463 } 4464 __kmp_set_system_affinity(mask, TRUE); 4465 } else { // Non-uniform topology 4466 4467 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4468 KMP_CPU_ZERO(mask); 4469 4470 int core_level = 4471 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); 4472 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, 4473 __kmp_aff_depth - 1, core_level); 4474 int nth_per_core = __kmp_affinity_max_proc_per_core( 4475 __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4476 4477 // For performance gain consider the special case nthreads == 4478 // __kmp_avail_proc 4479 if (nthreads == __kmp_avail_proc) { 4480 if (fine_gran) { 4481 int osID = __kmp_topology->at(tid).os_id; 4482 KMP_CPU_SET(osID, mask); 4483 } else { 4484 int core = 4485 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); 4486 for (int i = 0; i < __kmp_avail_proc; i++) { 4487 int osID = __kmp_topology->at(i).os_id; 4488 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == 4489 core) { 4490 KMP_CPU_SET(osID, mask); 4491 } 4492 } 4493 } 4494 } else if (nthreads <= ncores) { 4495 4496 int core = 0; 4497 for (int i = 0; i < ncores; i++) { 4498 // Check if this core from procarr[] is in the mask 4499 int in_mask = 0; 4500 for (int j = 0; j < nth_per_core; j++) { 4501 if (procarr[i * nth_per_core + j] != -1) { 4502 in_mask = 1; 4503 break; 4504 } 4505 } 4506 if (in_mask) { 4507 if (tid == core) { 4508 for (int j = 0; j < nth_per_core; j++) { 4509 int osID = procarr[i * nth_per_core + j]; 4510 if (osID != -1) { 4511 KMP_CPU_SET(osID, mask); 4512 // For fine granularity it is enough to set the first available 4513 // osID for this core 4514 if (fine_gran) { 4515 break; 4516 } 4517 } 4518 } 4519 break; 4520 } else { 4521 core++; 4522 } 4523 } 4524 } 4525 } else { // nthreads > ncores 4526 // Array to save the number of processors at each core 4527 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4528 // Array to save the number of cores with "x" available processors; 4529 int *ncores_with_x_procs = 4530 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4531 // Array to save the number of cores with # procs from x to nth_per_core 4532 int *ncores_with_x_to_max_procs = 4533 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4534 4535 for (int i = 0; i <= nth_per_core; i++) { 4536 ncores_with_x_procs[i] = 0; 4537 ncores_with_x_to_max_procs[i] = 0; 4538 } 4539 4540 for (int i = 0; i < ncores; i++) { 4541 int cnt = 0; 4542 for (int j = 0; j < nth_per_core; j++) { 4543 if (procarr[i * nth_per_core + j] != -1) { 4544 cnt++; 4545 } 4546 } 4547 nproc_at_core[i] = cnt; 4548 ncores_with_x_procs[cnt]++; 4549 } 4550 4551 for (int i = 0; i <= nth_per_core; i++) { 4552 for (int j = i; j <= nth_per_core; j++) { 4553 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4554 } 4555 } 4556 4557 // Max number of processors 4558 int nproc = nth_per_core * ncores; 4559 // An array to keep number of threads per each context 4560 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4561 for (int i = 0; i < nproc; i++) { 4562 newarr[i] = 0; 4563 } 4564 4565 int nth = nthreads; 4566 int flag = 0; 4567 while (nth > 0) { 4568 for (int j = 1; j <= nth_per_core; j++) { 4569 int cnt = ncores_with_x_to_max_procs[j]; 4570 for (int i = 0; i < ncores; i++) { 4571 // Skip the core with 0 processors 4572 if (nproc_at_core[i] == 0) { 4573 continue; 4574 } 4575 for (int k = 0; k < nth_per_core; k++) { 4576 if (procarr[i * nth_per_core + k] != -1) { 4577 if (newarr[i * nth_per_core + k] == 
0) { 4578 newarr[i * nth_per_core + k] = 1; 4579 cnt--; 4580 nth--; 4581 break; 4582 } else { 4583 if (flag != 0) { 4584 newarr[i * nth_per_core + k]++; 4585 cnt--; 4586 nth--; 4587 break; 4588 } 4589 } 4590 } 4591 } 4592 if (cnt == 0 || nth == 0) { 4593 break; 4594 } 4595 } 4596 if (nth == 0) { 4597 break; 4598 } 4599 } 4600 flag = 1; 4601 } 4602 int sum = 0; 4603 for (int i = 0; i < nproc; i++) { 4604 sum += newarr[i]; 4605 if (sum > tid) { 4606 if (fine_gran) { 4607 int osID = procarr[i]; 4608 KMP_CPU_SET(osID, mask); 4609 } else { 4610 int coreID = i / nth_per_core; 4611 for (int ii = 0; ii < nth_per_core; ii++) { 4612 int osID = procarr[coreID * nth_per_core + ii]; 4613 if (osID != -1) { 4614 KMP_CPU_SET(osID, mask); 4615 } 4616 } 4617 } 4618 break; 4619 } 4620 } 4621 __kmp_free(newarr); 4622 } 4623 4624 if (__kmp_affinity_verbose) { 4625 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4626 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4627 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4628 __kmp_gettid(), tid, buf); 4629 } 4630 __kmp_set_system_affinity(mask, TRUE); 4631 } 4632 } 4633 4634 #if KMP_OS_LINUX || KMP_OS_FREEBSD 4635 // We don't need this entry for Windows because 4636 // there is GetProcessAffinityMask() api 4637 // 4638 // The intended usage is indicated by these steps: 4639 // 1) The user gets the current affinity mask 4640 // 2) Then sets the affinity by calling this function 4641 // 3) Error check the return value 4642 // 4) Use non-OpenMP parallelization 4643 // 5) Reset the affinity to what was stored in step 1) 4644 #ifdef __cplusplus 4645 extern "C" 4646 #endif 4647 int 4648 kmp_set_thread_affinity_mask_initial() 4649 // the function returns 0 on success, 4650 // -1 if we cannot bind thread 4651 // >0 (errno) if an error happened during binding 4652 { 4653 int gtid = __kmp_get_gtid(); 4654 if (gtid < 0) { 4655 // Do not touch non-omp threads 4656 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4657 "non-omp thread, returning\n")); 4658 return -1; 4659 } 4660 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 4661 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4662 "affinity not initialized, returning\n")); 4663 return -1; 4664 } 4665 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4666 "set full mask for thread %d\n", 4667 gtid)); 4668 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 4669 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 4670 } 4671 #endif 4672 4673 #endif // KMP_AFFINITY_SUPPORTED 4674
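// Illustrative usage sketch for kmp_set_thread_affinity_mask_initial() (defined
// above for Linux/FreeBSD builds), kept under "#if 0" so it is never compiled.
// It follows the five steps documented next to that function; it assumes
// _GNU_SOURCE for the pthread_*affinity_np calls, and do_non_openmp_work() is a
// hypothetical placeholder for the user's non-OpenMP parallel region.
#if 0
#include <pthread.h>
#include <sched.h>

extern void do_non_openmp_work(void); // hypothetical user function

static void affinity_mask_initial_example(void) {
  cpu_set_t saved;
  CPU_ZERO(&saved);
  // 1) Save this thread's current affinity mask.
  pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved);
  // 2) Widen the mask to the runtime's initial (full) mask.
  int rc = kmp_set_thread_affinity_mask_initial();
  // 3) Error check: 0 on success, -1 if the thread cannot be bound,
  //    >0 (errno) if an error happened during binding.
  if (rc == 0) {
    // 4) Run the non-OpenMP parallelization here.
    do_non_openmp_work();
  }
  // 5) Restore the affinity mask saved in step 1.
  pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved);
}
#endif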