1 /* 2 * kmp_affinity.cpp -- affinity management 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_i18n.h" 16 #include "kmp_io.h" 17 #include "kmp_str.h" 18 #include "kmp_wrapper_getpid.h" 19 #if KMP_USE_HIER_SCHED 20 #include "kmp_dispatch_hier.h" 21 #endif 22 #if KMP_USE_HWLOC 23 // Copied from hwloc 24 #define HWLOC_GROUP_KIND_INTEL_MODULE 102 25 #define HWLOC_GROUP_KIND_INTEL_TILE 103 26 #define HWLOC_GROUP_KIND_INTEL_DIE 104 27 #define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220 28 #endif 29 30 // The machine topology 31 kmp_topology_t *__kmp_topology = nullptr; 32 // KMP_HW_SUBSET environment variable 33 kmp_hw_subset_t *__kmp_hw_subset = nullptr; 34 35 // Store the real or imagined machine hierarchy here 36 static hierarchy_info machine_hierarchy; 37 38 void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); } 39 40 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 41 kmp_uint32 depth; 42 // The test below is true if affinity is available, but set to "none". Need to 43 // init on first use of hierarchical barrier. 
44 if (TCR_1(machine_hierarchy.uninitialized)) 45 machine_hierarchy.init(nproc); 46 47 // Adjust the hierarchy in case num threads exceeds original 48 if (nproc > machine_hierarchy.base_num_threads) 49 machine_hierarchy.resize(nproc); 50 51 depth = machine_hierarchy.depth; 52 KMP_DEBUG_ASSERT(depth > 0); 53 54 thr_bar->depth = depth; 55 __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1, 56 &(thr_bar->base_leaf_kids)); 57 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 58 } 59 60 static int nCoresPerPkg, nPackages; 61 static int __kmp_nThreadsPerCore; 62 #ifndef KMP_DFLT_NTH_CORES 63 static int __kmp_ncores; 64 #endif 65 66 const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { 67 switch (type) { 68 case KMP_HW_SOCKET: 69 return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket)); 70 case KMP_HW_DIE: 71 return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die)); 72 case KMP_HW_MODULE: 73 return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module)); 74 case KMP_HW_TILE: 75 return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile)); 76 case KMP_HW_NUMA: 77 return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain)); 78 case KMP_HW_L3: 79 return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache)); 80 case KMP_HW_L2: 81 return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache)); 82 case KMP_HW_L1: 83 return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache)); 84 case KMP_HW_LLC: 85 return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache)); 86 case KMP_HW_CORE: 87 return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core)); 88 case KMP_HW_THREAD: 89 return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); 90 case KMP_HW_PROC_GROUP: 91 return ((plural) ? 
KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); 92 } 93 return KMP_I18N_STR(Unknown); 94 } 95 96 const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { 97 switch (type) { 98 case KMP_HW_SOCKET: 99 return ((plural) ? "sockets" : "socket"); 100 case KMP_HW_DIE: 101 return ((plural) ? "dice" : "die"); 102 case KMP_HW_MODULE: 103 return ((plural) ? "modules" : "module"); 104 case KMP_HW_TILE: 105 return ((plural) ? "tiles" : "tile"); 106 case KMP_HW_NUMA: 107 return ((plural) ? "numa_domains" : "numa_domain"); 108 case KMP_HW_L3: 109 return ((plural) ? "l3_caches" : "l3_cache"); 110 case KMP_HW_L2: 111 return ((plural) ? "l2_caches" : "l2_cache"); 112 case KMP_HW_L1: 113 return ((plural) ? "l1_caches" : "l1_cache"); 114 case KMP_HW_LLC: 115 return ((plural) ? "ll_caches" : "ll_cache"); 116 case KMP_HW_CORE: 117 return ((plural) ? "cores" : "core"); 118 case KMP_HW_THREAD: 119 return ((plural) ? "threads" : "thread"); 120 case KMP_HW_PROC_GROUP: 121 return ((plural) ? "proc_groups" : "proc_group"); 122 } 123 return ((plural) ? 
"unknowns" : "unknown"); 124 } 125 126 const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { 127 switch (type) { 128 case KMP_HW_CORE_TYPE_UNKNOWN: 129 return "unknown"; 130 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 131 case KMP_HW_CORE_TYPE_ATOM: 132 return "Intel Atom(R) processor"; 133 case KMP_HW_CORE_TYPE_CORE: 134 return "Intel(R) Core(TM) processor"; 135 #endif 136 } 137 return "unknown"; 138 } 139 140 //////////////////////////////////////////////////////////////////////////////// 141 // kmp_hw_thread_t methods 142 int kmp_hw_thread_t::compare_ids(const void *a, const void *b) { 143 const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a; 144 const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b; 145 int depth = __kmp_topology->get_depth(); 146 for (int level = 0; level < depth; ++level) { 147 if (ahwthread->ids[level] < bhwthread->ids[level]) 148 return -1; 149 else if (ahwthread->ids[level] > bhwthread->ids[level]) 150 return 1; 151 } 152 if (ahwthread->os_id < bhwthread->os_id) 153 return -1; 154 else if (ahwthread->os_id > bhwthread->os_id) 155 return 1; 156 return 0; 157 } 158 159 #if KMP_AFFINITY_SUPPORTED 160 int kmp_hw_thread_t::compare_compact(const void *a, const void *b) { 161 int i; 162 const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a; 163 const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b; 164 int depth = __kmp_topology->get_depth(); 165 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 166 KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth); 167 for (i = 0; i < __kmp_affinity_compact; i++) { 168 int j = depth - i - 1; 169 if (aa->sub_ids[j] < bb->sub_ids[j]) 170 return -1; 171 if (aa->sub_ids[j] > bb->sub_ids[j]) 172 return 1; 173 } 174 for (; i < depth; i++) { 175 int j = i - __kmp_affinity_compact; 176 if (aa->sub_ids[j] < bb->sub_ids[j]) 177 return -1; 178 if (aa->sub_ids[j] > bb->sub_ids[j]) 179 return 1; 180 } 181 return 0; 182 } 183 #endif 184 185 void kmp_hw_thread_t::print() const { 186 int depth = 
__kmp_topology->get_depth(); 187 printf("%4d ", os_id); 188 for (int i = 0; i < depth; ++i) { 189 printf("%4d ", ids[i]); 190 } 191 if (core_type != KMP_HW_CORE_TYPE_UNKNOWN) { 192 printf(" (%s)", __kmp_hw_get_core_type_string(core_type)); 193 } 194 printf("\n"); 195 } 196 197 //////////////////////////////////////////////////////////////////////////////// 198 // kmp_topology_t methods 199 200 // Remove layers that don't add information to the topology. 201 // This is done by having the layer take on the id = UNKNOWN_ID (-1) 202 void kmp_topology_t::_remove_radix1_layers() { 203 int preference[KMP_HW_LAST]; 204 int top_index1, top_index2; 205 // Set up preference associative array 206 preference[KMP_HW_PROC_GROUP] = 110; 207 preference[KMP_HW_SOCKET] = 100; 208 preference[KMP_HW_CORE] = 95; 209 preference[KMP_HW_THREAD] = 90; 210 preference[KMP_HW_NUMA] = 85; 211 preference[KMP_HW_DIE] = 80; 212 preference[KMP_HW_TILE] = 75; 213 preference[KMP_HW_MODULE] = 73; 214 preference[KMP_HW_L3] = 70; 215 preference[KMP_HW_L2] = 65; 216 preference[KMP_HW_L1] = 60; 217 preference[KMP_HW_LLC] = 5; 218 top_index1 = 0; 219 top_index2 = 1; 220 while (top_index1 < depth - 1 && top_index2 < depth) { 221 kmp_hw_t type1 = types[top_index1]; 222 kmp_hw_t type2 = types[top_index2]; 223 KMP_ASSERT_VALID_HW_TYPE(type1); 224 KMP_ASSERT_VALID_HW_TYPE(type2); 225 // Do not allow the three main topology levels (sockets, cores, threads) to 226 // be compacted down 227 if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE || 228 type1 == KMP_HW_SOCKET) && 229 (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE || 230 type2 == KMP_HW_SOCKET)) { 231 top_index1 = top_index2++; 232 continue; 233 } 234 bool radix1 = true; 235 bool all_same = true; 236 int id1 = hw_threads[0].ids[top_index1]; 237 int id2 = hw_threads[0].ids[top_index2]; 238 int pref1 = preference[type1]; 239 int pref2 = preference[type2]; 240 for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) { 241 if (hw_threads[hwidx].ids[top_index1] 
== id1 && 242 hw_threads[hwidx].ids[top_index2] != id2) { 243 radix1 = false; 244 break; 245 } 246 if (hw_threads[hwidx].ids[top_index2] != id2) 247 all_same = false; 248 id1 = hw_threads[hwidx].ids[top_index1]; 249 id2 = hw_threads[hwidx].ids[top_index2]; 250 } 251 if (radix1) { 252 // Select the layer to remove based on preference 253 kmp_hw_t remove_type, keep_type; 254 int remove_layer, remove_layer_ids; 255 if (pref1 > pref2) { 256 remove_type = type2; 257 remove_layer = remove_layer_ids = top_index2; 258 keep_type = type1; 259 } else { 260 remove_type = type1; 261 remove_layer = remove_layer_ids = top_index1; 262 keep_type = type2; 263 } 264 // If all the indexes for the second (deeper) layer are the same. 265 // e.g., all are zero, then make sure to keep the first layer's ids 266 if (all_same) 267 remove_layer_ids = top_index2; 268 // Remove radix one type by setting the equivalence, removing the id from 269 // the hw threads and removing the layer from types and depth 270 set_equivalent_type(remove_type, keep_type); 271 for (int idx = 0; idx < num_hw_threads; ++idx) { 272 kmp_hw_thread_t &hw_thread = hw_threads[idx]; 273 for (int d = remove_layer_ids; d < depth - 1; ++d) 274 hw_thread.ids[d] = hw_thread.ids[d + 1]; 275 } 276 for (int idx = remove_layer; idx < depth - 1; ++idx) 277 types[idx] = types[idx + 1]; 278 depth--; 279 } else { 280 top_index1 = top_index2++; 281 } 282 } 283 KMP_ASSERT(depth > 0); 284 } 285 286 void kmp_topology_t::_set_last_level_cache() { 287 if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN) 288 set_equivalent_type(KMP_HW_LLC, KMP_HW_L3); 289 else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) 290 set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); 291 #if KMP_MIC_SUPPORTED 292 else if (__kmp_mic_type == mic3) { 293 if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN) 294 set_equivalent_type(KMP_HW_LLC, KMP_HW_L2); 295 else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN) 296 set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE); 
297 // L2/Tile wasn't detected so just say L1 298 else 299 set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); 300 } 301 #endif 302 else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN) 303 set_equivalent_type(KMP_HW_LLC, KMP_HW_L1); 304 // Fallback is to set last level cache to socket or core 305 if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) { 306 if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN) 307 set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET); 308 else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN) 309 set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE); 310 } 311 KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN); 312 } 313 314 // Gather the count of each topology layer and the ratio 315 void kmp_topology_t::_gather_enumeration_information() { 316 int previous_id[KMP_HW_LAST]; 317 int max[KMP_HW_LAST]; 318 int previous_core_id = kmp_hw_thread_t::UNKNOWN_ID; 319 320 for (int i = 0; i < depth; ++i) { 321 previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID; 322 max[i] = 0; 323 count[i] = 0; 324 ratio[i] = 0; 325 } 326 if (__kmp_is_hybrid_cpu()) { 327 for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { 328 core_types_count[i] = 0; 329 core_types[i] = KMP_HW_CORE_TYPE_UNKNOWN; 330 } 331 } 332 for (int i = 0; i < num_hw_threads; ++i) { 333 kmp_hw_thread_t &hw_thread = hw_threads[i]; 334 for (int layer = 0; layer < depth; ++layer) { 335 int id = hw_thread.ids[layer]; 336 if (id != previous_id[layer]) { 337 // Add an additional increment to each count 338 for (int l = layer; l < depth; ++l) 339 count[l]++; 340 // Keep track of topology layer ratio statistics 341 max[layer]++; 342 for (int l = layer + 1; l < depth; ++l) { 343 if (max[l] > ratio[l]) 344 ratio[l] = max[l]; 345 max[l] = 1; 346 } 347 break; 348 } 349 } 350 for (int layer = 0; layer < depth; ++layer) { 351 previous_id[layer] = hw_thread.ids[layer]; 352 } 353 // Figure out the number of each core type for hybrid CPUs 354 if (__kmp_is_hybrid_cpu()) { 355 int core_level = 
get_level(KMP_HW_CORE); 356 if (core_level != -1) { 357 if (hw_thread.ids[core_level] != previous_core_id) 358 _increment_core_type(hw_thread.core_type); 359 previous_core_id = hw_thread.ids[core_level]; 360 } 361 } 362 } 363 for (int layer = 0; layer < depth; ++layer) { 364 if (max[layer] > ratio[layer]) 365 ratio[layer] = max[layer]; 366 } 367 } 368 369 // Find out if the topology is uniform 370 void kmp_topology_t::_discover_uniformity() { 371 int num = 1; 372 for (int level = 0; level < depth; ++level) 373 num *= ratio[level]; 374 flags.uniform = (num == count[depth - 1]); 375 } 376 377 // Set all the sub_ids for each hardware thread 378 void kmp_topology_t::_set_sub_ids() { 379 int previous_id[KMP_HW_LAST]; 380 int sub_id[KMP_HW_LAST]; 381 382 for (int i = 0; i < depth; ++i) { 383 previous_id[i] = -1; 384 sub_id[i] = -1; 385 } 386 for (int i = 0; i < num_hw_threads; ++i) { 387 kmp_hw_thread_t &hw_thread = hw_threads[i]; 388 // Setup the sub_id 389 for (int j = 0; j < depth; ++j) { 390 if (hw_thread.ids[j] != previous_id[j]) { 391 sub_id[j]++; 392 for (int k = j + 1; k < depth; ++k) { 393 sub_id[k] = 0; 394 } 395 break; 396 } 397 } 398 // Set previous_id 399 for (int j = 0; j < depth; ++j) { 400 previous_id[j] = hw_thread.ids[j]; 401 } 402 // Set the sub_ids field 403 for (int j = 0; j < depth; ++j) { 404 hw_thread.sub_ids[j] = sub_id[j]; 405 } 406 } 407 } 408 409 void kmp_topology_t::_set_globals() { 410 // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores 411 int core_level, thread_level, package_level; 412 package_level = get_level(KMP_HW_SOCKET); 413 #if KMP_GROUP_AFFINITY 414 if (package_level == -1) 415 package_level = get_level(KMP_HW_PROC_GROUP); 416 #endif 417 core_level = get_level(KMP_HW_CORE); 418 thread_level = get_level(KMP_HW_THREAD); 419 420 KMP_ASSERT(core_level != -1); 421 KMP_ASSERT(thread_level != -1); 422 423 __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level); 424 if (package_level != -1) { 425 nCoresPerPkg = 
calculate_ratio(core_level, package_level); 426 nPackages = get_count(package_level); 427 } else { 428 // assume one socket 429 nCoresPerPkg = get_count(core_level); 430 nPackages = 1; 431 } 432 #ifndef KMP_DFLT_NTH_CORES 433 __kmp_ncores = get_count(core_level); 434 #endif 435 } 436 437 kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth, 438 const kmp_hw_t *types) { 439 kmp_topology_t *retval; 440 // Allocate all data in one large allocation 441 size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc + 442 sizeof(int) * ndepth * 3; 443 char *bytes = (char *)__kmp_allocate(size); 444 retval = (kmp_topology_t *)bytes; 445 if (nproc > 0) { 446 retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t)); 447 } else { 448 retval->hw_threads = nullptr; 449 } 450 retval->num_hw_threads = nproc; 451 retval->depth = ndepth; 452 int *arr = 453 (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc); 454 retval->types = (kmp_hw_t *)arr; 455 retval->ratio = arr + ndepth; 456 retval->count = arr + 2 * ndepth; 457 KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; } 458 for (int i = 0; i < ndepth; ++i) { 459 retval->types[i] = types[i]; 460 retval->equivalent[types[i]] = types[i]; 461 } 462 return retval; 463 } 464 465 void kmp_topology_t::deallocate(kmp_topology_t *topology) { 466 if (topology) 467 __kmp_free(topology); 468 } 469 470 bool kmp_topology_t::check_ids() const { 471 // Assume ids have been sorted 472 if (num_hw_threads == 0) 473 return true; 474 for (int i = 1; i < num_hw_threads; ++i) { 475 kmp_hw_thread_t ¤t_thread = hw_threads[i]; 476 kmp_hw_thread_t &previous_thread = hw_threads[i - 1]; 477 bool unique = false; 478 for (int j = 0; j < depth; ++j) { 479 if (previous_thread.ids[j] != current_thread.ids[j]) { 480 unique = true; 481 break; 482 } 483 } 484 if (unique) 485 continue; 486 return false; 487 } 488 return true; 489 } 490 491 void kmp_topology_t::dump() const { 492 
printf("***********************\n"); 493 printf("*** __kmp_topology: ***\n"); 494 printf("***********************\n"); 495 printf("* depth: %d\n", depth); 496 497 printf("* types: "); 498 for (int i = 0; i < depth; ++i) 499 printf("%15s ", __kmp_hw_get_keyword(types[i])); 500 printf("\n"); 501 502 printf("* ratio: "); 503 for (int i = 0; i < depth; ++i) { 504 printf("%15d ", ratio[i]); 505 } 506 printf("\n"); 507 508 printf("* count: "); 509 for (int i = 0; i < depth; ++i) { 510 printf("%15d ", count[i]); 511 } 512 printf("\n"); 513 514 printf("* core_types:\n"); 515 for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { 516 if (core_types[i] != KMP_HW_CORE_TYPE_UNKNOWN) { 517 printf(" %d %s core%c\n", core_types_count[i], 518 __kmp_hw_get_core_type_string(core_types[i]), 519 ((core_types_count[i] > 1) ? 's' : ' ')); 520 } else { 521 if (i == 0) 522 printf("No hybrid information available\n"); 523 break; 524 } 525 } 526 527 printf("* equivalent map:\n"); 528 KMP_FOREACH_HW_TYPE(i) { 529 const char *key = __kmp_hw_get_keyword(i); 530 const char *value = __kmp_hw_get_keyword(equivalent[i]); 531 printf("%-15s -> %-15s\n", key, value); 532 } 533 534 printf("* uniform: %s\n", (is_uniform() ? 
"Yes" : "No")); 535 536 printf("* num_hw_threads: %d\n", num_hw_threads); 537 printf("* hw_threads:\n"); 538 for (int i = 0; i < num_hw_threads; ++i) { 539 hw_threads[i].print(); 540 } 541 printf("***********************\n"); 542 } 543 544 void kmp_topology_t::print(const char *env_var) const { 545 kmp_str_buf_t buf; 546 int print_types_depth; 547 __kmp_str_buf_init(&buf); 548 kmp_hw_t print_types[KMP_HW_LAST + 2]; 549 550 // Num Available Threads 551 KMP_INFORM(AvailableOSProc, env_var, num_hw_threads); 552 553 // Uniform or not 554 if (is_uniform()) { 555 KMP_INFORM(Uniform, env_var); 556 } else { 557 KMP_INFORM(NonUniform, env_var); 558 } 559 560 // Equivalent types 561 KMP_FOREACH_HW_TYPE(type) { 562 kmp_hw_t eq_type = equivalent[type]; 563 if (eq_type != KMP_HW_UNKNOWN && eq_type != type) { 564 KMP_INFORM(AffEqualTopologyTypes, env_var, 565 __kmp_hw_get_catalog_string(type), 566 __kmp_hw_get_catalog_string(eq_type)); 567 } 568 } 569 570 // Quick topology 571 KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST); 572 // Create a print types array that always guarantees printing 573 // the core and thread level 574 print_types_depth = 0; 575 for (int level = 0; level < depth; ++level) 576 print_types[print_types_depth++] = types[level]; 577 if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) { 578 // Force in the core level for quick topology 579 if (print_types[print_types_depth - 1] == KMP_HW_THREAD) { 580 // Force core before thread e.g., 1 socket X 2 threads/socket 581 // becomes 1 socket X 1 core/socket X 2 threads/socket 582 print_types[print_types_depth - 1] = KMP_HW_CORE; 583 print_types[print_types_depth++] = KMP_HW_THREAD; 584 } else { 585 print_types[print_types_depth++] = KMP_HW_CORE; 586 } 587 } 588 // Always put threads at very end of quick topology 589 if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD) 590 print_types[print_types_depth++] = KMP_HW_THREAD; 591 592 __kmp_str_buf_clear(&buf); 593 kmp_hw_t numerator_type; 594 kmp_hw_t denominator_type = 
KMP_HW_UNKNOWN; 595 int core_level = get_level(KMP_HW_CORE); 596 int ncores = get_count(core_level); 597 598 for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) { 599 int c; 600 bool plural; 601 numerator_type = print_types[plevel]; 602 KMP_ASSERT_VALID_HW_TYPE(numerator_type); 603 if (equivalent[numerator_type] != numerator_type) 604 c = 1; 605 else 606 c = get_ratio(level++); 607 plural = (c > 1); 608 if (plevel == 0) { 609 __kmp_str_buf_print(&buf, "%d %s", c, 610 __kmp_hw_get_catalog_string(numerator_type, plural)); 611 } else { 612 __kmp_str_buf_print(&buf, " x %d %s/%s", c, 613 __kmp_hw_get_catalog_string(numerator_type, plural), 614 __kmp_hw_get_catalog_string(denominator_type)); 615 } 616 denominator_type = numerator_type; 617 } 618 KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores); 619 620 if (__kmp_is_hybrid_cpu()) { 621 for (int i = 0; i < KMP_HW_MAX_NUM_CORE_TYPES; ++i) { 622 if (core_types[i] == KMP_HW_CORE_TYPE_UNKNOWN) 623 break; 624 KMP_INFORM(TopologyHybrid, env_var, core_types_count[i], 625 __kmp_hw_get_core_type_string(core_types[i])); 626 } 627 } 628 629 if (num_hw_threads <= 0) { 630 __kmp_str_buf_free(&buf); 631 return; 632 } 633 634 // Full OS proc to hardware thread map 635 KMP_INFORM(OSProcToPhysicalThreadMap, env_var); 636 for (int i = 0; i < num_hw_threads; i++) { 637 __kmp_str_buf_clear(&buf); 638 for (int level = 0; level < depth; ++level) { 639 kmp_hw_t type = types[level]; 640 __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type)); 641 __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]); 642 } 643 if (__kmp_is_hybrid_cpu()) 644 __kmp_str_buf_print( 645 &buf, "(%s)", __kmp_hw_get_core_type_string(hw_threads[i].core_type)); 646 KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str); 647 } 648 649 __kmp_str_buf_free(&buf); 650 } 651 652 void kmp_topology_t::canonicalize() { 653 _remove_radix1_layers(); 654 _gather_enumeration_information(); 655 _discover_uniformity(); 656 
_set_sub_ids(); 657 _set_globals(); 658 _set_last_level_cache(); 659 660 #if KMP_MIC_SUPPORTED 661 // Manually Add L2 = Tile equivalence 662 if (__kmp_mic_type == mic3) { 663 if (get_level(KMP_HW_L2) != -1) 664 set_equivalent_type(KMP_HW_TILE, KMP_HW_L2); 665 else if (get_level(KMP_HW_TILE) != -1) 666 set_equivalent_type(KMP_HW_L2, KMP_HW_TILE); 667 } 668 #endif 669 670 // Perform post canonicalization checking 671 KMP_ASSERT(depth > 0); 672 for (int level = 0; level < depth; ++level) { 673 // All counts, ratios, and types must be valid 674 KMP_ASSERT(count[level] > 0 && ratio[level] > 0); 675 KMP_ASSERT_VALID_HW_TYPE(types[level]); 676 // Detected types must point to themselves 677 KMP_ASSERT(equivalent[types[level]] == types[level]); 678 } 679 680 #if KMP_AFFINITY_SUPPORTED 681 // Set the number of affinity granularity levels 682 if (__kmp_affinity_gran_levels < 0) { 683 kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran); 684 // Check if user's granularity request is valid 685 if (gran_type == KMP_HW_UNKNOWN) { 686 // First try core, then thread, then package 687 kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET}; 688 for (auto g : gran_types) { 689 if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) { 690 gran_type = g; 691 break; 692 } 693 } 694 KMP_ASSERT(gran_type != KMP_HW_UNKNOWN); 695 // Warn user what granularity setting will be used instead 696 KMP_WARNING(AffGranularityBad, "KMP_AFFINITY", 697 __kmp_hw_get_catalog_string(__kmp_affinity_gran), 698 __kmp_hw_get_catalog_string(gran_type)); 699 __kmp_affinity_gran = gran_type; 700 } 701 __kmp_affinity_gran_levels = 0; 702 for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i) 703 __kmp_affinity_gran_levels++; 704 } 705 #endif // KMP_AFFINITY_SUPPORTED 706 } 707 708 // Canonicalize an explicit packages X cores/pkg X threads/core topology 709 void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg, 710 int nthreads_per_core, int ncores) { 711 int 
ndepth = 3; 712 depth = ndepth; 713 KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; } 714 for (int level = 0; level < depth; ++level) { 715 count[level] = 0; 716 ratio[level] = 0; 717 } 718 count[0] = npackages; 719 count[1] = ncores; 720 count[2] = __kmp_xproc; 721 ratio[0] = npackages; 722 ratio[1] = ncores_per_pkg; 723 ratio[2] = nthreads_per_core; 724 equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET; 725 equivalent[KMP_HW_CORE] = KMP_HW_CORE; 726 equivalent[KMP_HW_THREAD] = KMP_HW_THREAD; 727 types[0] = KMP_HW_SOCKET; 728 types[1] = KMP_HW_CORE; 729 types[2] = KMP_HW_THREAD; 730 //__kmp_avail_proc = __kmp_xproc; 731 _discover_uniformity(); 732 } 733 734 // Apply the KMP_HW_SUBSET envirable to the topology 735 // Returns true if KMP_HW_SUBSET filtered any processors 736 // otherwise, returns false 737 bool kmp_topology_t::filter_hw_subset() { 738 // If KMP_HW_SUBSET wasn't requested, then do nothing. 739 if (!__kmp_hw_subset) 740 return false; 741 742 // First, sort the KMP_HW_SUBSET items by the machine topology 743 __kmp_hw_subset->sort(); 744 745 // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology 746 int hw_subset_depth = __kmp_hw_subset->get_depth(); 747 kmp_hw_t specified[KMP_HW_LAST]; 748 KMP_ASSERT(hw_subset_depth > 0); 749 KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; } 750 for (int i = 0; i < hw_subset_depth; ++i) { 751 int max_count; 752 int num = __kmp_hw_subset->at(i).num; 753 int offset = __kmp_hw_subset->at(i).offset; 754 kmp_hw_t type = __kmp_hw_subset->at(i).type; 755 kmp_hw_t equivalent_type = equivalent[type]; 756 int level = get_level(type); 757 758 // Check to see if current layer is in detected machine topology 759 if (equivalent_type != KMP_HW_UNKNOWN) { 760 __kmp_hw_subset->at(i).type = equivalent_type; 761 } else { 762 KMP_WARNING(AffHWSubsetNotExistGeneric, 763 __kmp_hw_get_catalog_string(type)); 764 return false; 765 } 766 767 // Check to see if current layer has already been specified 768 // either 
directly or through an equivalent type 769 if (specified[equivalent_type] != KMP_HW_UNKNOWN) { 770 KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type), 771 __kmp_hw_get_catalog_string(specified[equivalent_type])); 772 return false; 773 } 774 specified[equivalent_type] = type; 775 776 // Check to see if each layer's num & offset parameters are valid 777 max_count = get_ratio(level); 778 if (max_count < 0 || num + offset > max_count) { 779 bool plural = (num > 1); 780 KMP_WARNING(AffHWSubsetManyGeneric, 781 __kmp_hw_get_catalog_string(type, plural)); 782 return false; 783 } 784 } 785 786 // Apply the filtered hardware subset 787 int new_index = 0; 788 for (int i = 0; i < num_hw_threads; ++i) { 789 kmp_hw_thread_t &hw_thread = hw_threads[i]; 790 // Check to see if this hardware thread should be filtered 791 bool should_be_filtered = false; 792 for (int level = 0, hw_subset_index = 0; 793 level < depth && hw_subset_index < hw_subset_depth; ++level) { 794 kmp_hw_t topology_type = types[level]; 795 auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index); 796 kmp_hw_t hw_subset_type = hw_subset_item.type; 797 if (topology_type != hw_subset_type) 798 continue; 799 int num = hw_subset_item.num; 800 int offset = hw_subset_item.offset; 801 hw_subset_index++; 802 if (hw_thread.sub_ids[level] < offset || 803 hw_thread.sub_ids[level] >= offset + num) { 804 should_be_filtered = true; 805 break; 806 } 807 } 808 if (!should_be_filtered) { 809 if (i != new_index) 810 hw_threads[new_index] = hw_thread; 811 new_index++; 812 } else { 813 #if KMP_AFFINITY_SUPPORTED 814 KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask); 815 #endif 816 __kmp_avail_proc--; 817 } 818 } 819 KMP_DEBUG_ASSERT(new_index <= num_hw_threads); 820 num_hw_threads = new_index; 821 822 // Post hardware subset canonicalization 823 _gather_enumeration_information(); 824 _discover_uniformity(); 825 _set_globals(); 826 _set_last_level_cache(); 827 return true; 828 } 829 830 bool 
kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const { 831 if (hw_level >= depth) 832 return true; 833 bool retval = true; 834 const kmp_hw_thread_t &t1 = hw_threads[hwt1]; 835 const kmp_hw_thread_t &t2 = hw_threads[hwt2]; 836 for (int i = 0; i < (depth - hw_level); ++i) { 837 if (t1.ids[i] != t2.ids[i]) 838 return false; 839 } 840 return retval; 841 } 842 843 //////////////////////////////////////////////////////////////////////////////// 844 845 #if KMP_AFFINITY_SUPPORTED 846 class kmp_affinity_raii_t { 847 kmp_affin_mask_t *mask; 848 bool restored; 849 850 public: 851 kmp_affinity_raii_t() : restored(false) { 852 KMP_CPU_ALLOC(mask); 853 KMP_ASSERT(mask != NULL); 854 __kmp_get_system_affinity(mask, TRUE); 855 } 856 void restore() { 857 __kmp_set_system_affinity(mask, TRUE); 858 KMP_CPU_FREE(mask); 859 restored = true; 860 } 861 ~kmp_affinity_raii_t() { 862 if (!restored) { 863 __kmp_set_system_affinity(mask, TRUE); 864 KMP_CPU_FREE(mask); 865 } 866 } 867 }; 868 869 bool KMPAffinity::picked_api = false; 870 871 void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); } 872 void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); } 873 void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); } 874 void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); } 875 void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); } 876 void KMPAffinity::operator delete(void *p) { __kmp_free(p); } 877 878 void KMPAffinity::pick_api() { 879 KMPAffinity *affinity_dispatch; 880 if (picked_api) 881 return; 882 #if KMP_USE_HWLOC 883 // Only use Hwloc if affinity isn't explicitly disabled and 884 // user requests Hwloc topology method 885 if (__kmp_affinity_top_method == affinity_top_method_hwloc && 886 __kmp_affinity_type != affinity_disabled) { 887 affinity_dispatch = new KMPHwlocAffinity(); 888 } else 889 #endif 890 { 891 affinity_dispatch = new KMPNativeAffinity(); 892 } 893 __kmp_affinity_dispatch 
= affinity_dispatch; 894 picked_api = true; 895 } 896 897 void KMPAffinity::destroy_api() { 898 if (__kmp_affinity_dispatch != NULL) { 899 delete __kmp_affinity_dispatch; 900 __kmp_affinity_dispatch = NULL; 901 picked_api = false; 902 } 903 } 904 905 #define KMP_ADVANCE_SCAN(scan) \ 906 while (*scan != '\0') { \ 907 scan++; \ 908 } 909 910 // Print the affinity mask to the character array in a pretty format. 911 // The format is a comma separated list of non-negative integers or integer 912 // ranges: e.g., 1,2,3-5,7,9-15 913 // The format can also be the string "{<empty>}" if no bits are set in mask 914 char *__kmp_affinity_print_mask(char *buf, int buf_len, 915 kmp_affin_mask_t *mask) { 916 int start = 0, finish = 0, previous = 0; 917 bool first_range; 918 KMP_ASSERT(buf); 919 KMP_ASSERT(buf_len >= 40); 920 KMP_ASSERT(mask); 921 char *scan = buf; 922 char *end = buf + buf_len - 1; 923 924 // Check for empty set. 925 if (mask->begin() == mask->end()) { 926 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}"); 927 KMP_ADVANCE_SCAN(scan); 928 KMP_ASSERT(scan <= end); 929 return buf; 930 } 931 932 first_range = true; 933 start = mask->begin(); 934 while (1) { 935 // Find next range 936 // [start, previous] is inclusive range of contiguous bits in mask 937 for (finish = mask->next(start), previous = start; 938 finish == previous + 1 && finish != mask->end(); 939 finish = mask->next(finish)) { 940 previous = finish; 941 } 942 943 // The first range does not need a comma printed before it, but the rest 944 // of the ranges do need a comma beforehand 945 if (!first_range) { 946 KMP_SNPRINTF(scan, end - scan + 1, "%s", ","); 947 KMP_ADVANCE_SCAN(scan); 948 } else { 949 first_range = false; 950 } 951 // Range with three or more contiguous bits in the affinity mask 952 if (previous - start > 1) { 953 KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous); 954 } else { 955 // Range with one or two contiguous bits in the affinity mask 956 KMP_SNPRINTF(scan, end - scan + 1, 
"%u", start); 957 KMP_ADVANCE_SCAN(scan); 958 if (previous - start > 0) { 959 KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous); 960 } 961 } 962 KMP_ADVANCE_SCAN(scan); 963 // Start over with new start point 964 start = finish; 965 if (start == mask->end()) 966 break; 967 // Check for overflow 968 if (end - scan < 2) 969 break; 970 } 971 972 // Check for overflow 973 KMP_ASSERT(scan <= end); 974 return buf; 975 } 976 #undef KMP_ADVANCE_SCAN 977 978 // Print the affinity mask to the string buffer object in a pretty format 979 // The format is a comma separated list of non-negative integers or integer 980 // ranges: e.g., 1,2,3-5,7,9-15 981 // The format can also be the string "{<empty>}" if no bits are set in mask 982 kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf, 983 kmp_affin_mask_t *mask) { 984 int start = 0, finish = 0, previous = 0; 985 bool first_range; 986 KMP_ASSERT(buf); 987 KMP_ASSERT(mask); 988 989 __kmp_str_buf_clear(buf); 990 991 // Check for empty set. 992 if (mask->begin() == mask->end()) { 993 __kmp_str_buf_print(buf, "%s", "{<empty>}"); 994 return buf; 995 } 996 997 first_range = true; 998 start = mask->begin(); 999 while (1) { 1000 // Find next range 1001 // [start, previous] is inclusive range of contiguous bits in mask 1002 for (finish = mask->next(start), previous = start; 1003 finish == previous + 1 && finish != mask->end(); 1004 finish = mask->next(finish)) { 1005 previous = finish; 1006 } 1007 1008 // The first range does not need a comma printed before it, but the rest 1009 // of the ranges do need a comma beforehand 1010 if (!first_range) { 1011 __kmp_str_buf_print(buf, "%s", ","); 1012 } else { 1013 first_range = false; 1014 } 1015 // Range with three or more contiguous bits in the affinity mask 1016 if (previous - start > 1) { 1017 __kmp_str_buf_print(buf, "%u-%u", start, previous); 1018 } else { 1019 // Range with one or two contiguous bits in the affinity mask 1020 __kmp_str_buf_print(buf, "%u", start); 1021 if 
(previous - start > 0) { 1022 __kmp_str_buf_print(buf, ",%u", previous); 1023 } 1024 } 1025 // Start over with new start point 1026 start = finish; 1027 if (start == mask->end()) 1028 break; 1029 } 1030 return buf; 1031 } 1032 1033 void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) { 1034 KMP_CPU_ZERO(mask); 1035 1036 #if KMP_GROUP_AFFINITY 1037 1038 if (__kmp_num_proc_groups > 1) { 1039 int group; 1040 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 1041 for (group = 0; group < __kmp_num_proc_groups; group++) { 1042 int i; 1043 int num = __kmp_GetActiveProcessorCount(group); 1044 for (i = 0; i < num; i++) { 1045 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 1046 } 1047 } 1048 } else 1049 1050 #endif /* KMP_GROUP_AFFINITY */ 1051 1052 { 1053 int proc; 1054 for (proc = 0; proc < __kmp_xproc; proc++) { 1055 KMP_CPU_SET(proc, mask); 1056 } 1057 } 1058 } 1059 1060 // All of the __kmp_affinity_create_*_map() routines should allocate the 1061 // internal topology object and set the layer ids for it. Each routine 1062 // returns a boolean on whether it was successful at doing so. 
1063 kmp_affin_mask_t *__kmp_affin_fullMask = NULL; 1064 1065 #if KMP_USE_HWLOC 1066 static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) { 1067 #if HWLOC_API_VERSION >= 0x00020000 1068 return hwloc_obj_type_is_cache(obj->type); 1069 #else 1070 return obj->type == HWLOC_OBJ_CACHE; 1071 #endif 1072 } 1073 1074 // Returns KMP_HW_* type derived from HWLOC_* type 1075 static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) { 1076 1077 if (__kmp_hwloc_is_cache_type(obj)) { 1078 if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION) 1079 return KMP_HW_UNKNOWN; 1080 switch (obj->attr->cache.depth) { 1081 case 1: 1082 return KMP_HW_L1; 1083 case 2: 1084 #if KMP_MIC_SUPPORTED 1085 if (__kmp_mic_type == mic3) { 1086 return KMP_HW_TILE; 1087 } 1088 #endif 1089 return KMP_HW_L2; 1090 case 3: 1091 return KMP_HW_L3; 1092 } 1093 return KMP_HW_UNKNOWN; 1094 } 1095 1096 switch (obj->type) { 1097 case HWLOC_OBJ_PACKAGE: 1098 return KMP_HW_SOCKET; 1099 case HWLOC_OBJ_NUMANODE: 1100 return KMP_HW_NUMA; 1101 case HWLOC_OBJ_CORE: 1102 return KMP_HW_CORE; 1103 case HWLOC_OBJ_PU: 1104 return KMP_HW_THREAD; 1105 case HWLOC_OBJ_GROUP: 1106 if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE) 1107 return KMP_HW_DIE; 1108 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE) 1109 return KMP_HW_TILE; 1110 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE) 1111 return KMP_HW_MODULE; 1112 else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP) 1113 return KMP_HW_PROC_GROUP; 1114 return KMP_HW_UNKNOWN; 1115 #if HWLOC_API_VERSION >= 0x00020100 1116 case HWLOC_OBJ_DIE: 1117 return KMP_HW_DIE; 1118 #endif 1119 } 1120 return KMP_HW_UNKNOWN; 1121 } 1122 1123 // Returns the number of objects of type 'type' below 'obj' within the topology 1124 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is 1125 // HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET 1126 // object. 
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  // Walk siblings of the first 'type' object below 'obj'; count only those
  // whose ancestor of obj's type is 'obj' itself (i.e., actual descendants).
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  // Scan backwards through earlier siblings contained in 'higher'; if a
  // previously-numbered sibling is found, continue its numbering, otherwise
  // count how many precede 'lower'.
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      // A previous sibling already has a cached sub_id (stored as sub_id + 1).
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // store sub_id + 1 so that 0 is differed from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

// Build __kmp_topology from the hwloc topology tree. Determines the layer
// types by walking from a PU up to the root, then fills in per-hw-thread
// layer ids for every PU in the full affinity mask.
static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existance of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  // Start from the first PU in the full mask and walk up to the root,
  // recording each hwloc level that maps to a KMP_HW_* layer.
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    // hwloc 2.x hangs NUMA nodes off a separate memory-children list.
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_DEBUG_ASSERT(depth > 0);

  // Get the order for the types correct
  // (the walk above recorded leaf-to-root; reverse to root-to-leaf order)
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  // NOTE: assignment in the loop condition is intentional; iterates all PUs.
  while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    // Walk up to the root, filling layer ids from the leaf end backwards.
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
1308 #endif // KMP_USE_HWLOC 1309 1310 // If we don't know how to retrieve the machine's processor topology, or 1311 // encounter an error in doing so, this routine is called to form a "flat" 1312 // mapping of os thread id's <-> processor id's. 1313 static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) { 1314 *msg_id = kmp_i18n_null; 1315 int depth = 3; 1316 kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD}; 1317 1318 if (__kmp_affinity_verbose) { 1319 KMP_INFORM(UsingFlatOS, "KMP_AFFINITY"); 1320 } 1321 1322 // Even if __kmp_affinity_type == affinity_none, this routine might still 1323 // called to set __kmp_ncores, as well as 1324 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1325 if (!KMP_AFFINITY_CAPABLE()) { 1326 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1327 __kmp_ncores = nPackages = __kmp_xproc; 1328 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1329 return true; 1330 } 1331 1332 // When affinity is off, this routine will still be called to set 1333 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 1334 // Make sure all these vars are set correctly, and return now if affinity is 1335 // not enabled. 1336 __kmp_ncores = nPackages = __kmp_avail_proc; 1337 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1338 1339 // Construct the data structure to be returned. 1340 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); 1341 int avail_ct = 0; 1342 int i; 1343 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 1344 // Skip this proc if it is not included in the machine model. 
1345 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 1346 continue; 1347 } 1348 kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct); 1349 hw_thread.clear(); 1350 hw_thread.os_id = i; 1351 hw_thread.ids[0] = i; 1352 hw_thread.ids[1] = 0; 1353 hw_thread.ids[2] = 0; 1354 avail_ct++; 1355 } 1356 if (__kmp_affinity_verbose) { 1357 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY"); 1358 } 1359 return true; 1360 } 1361 1362 #if KMP_GROUP_AFFINITY 1363 // If multiple Windows* OS processor groups exist, we can create a 2-level 1364 // topology map with the groups at level 0 and the individual procs at level 1. 1365 // This facilitates letting the threads float among all procs in a group, 1366 // if granularity=group (the default when there are multiple groups). 1367 static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) { 1368 *msg_id = kmp_i18n_null; 1369 int depth = 3; 1370 kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD}; 1371 const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR); 1372 1373 if (__kmp_affinity_verbose) { 1374 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 1375 } 1376 1377 // If we aren't affinity capable, then use flat topology 1378 if (!KMP_AFFINITY_CAPABLE()) { 1379 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1380 nPackages = __kmp_num_proc_groups; 1381 __kmp_nThreadsPerCore = 1; 1382 __kmp_ncores = __kmp_xproc; 1383 nCoresPerPkg = nPackages / __kmp_ncores; 1384 return true; 1385 } 1386 1387 // Construct the data structure to be returned. 1388 __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types); 1389 int avail_ct = 0; 1390 int i; 1391 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 1392 // Skip this proc if it is not included in the machine model. 
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.clear();
    hw_thread.os_id = i;
    // Group index is the layer-0 id; the in-group bit position serves as both
    // the core and thread id.
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
  }
  return true;
}
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

// Extract bits [LSB, MSB] (inclusive) of v, shifted down to bit 0.
template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

// Smallest r such that (1 << r) >= count, i.e. ceil(log2(count)).
// Used to derive APIC id field widths from CPUID maximum counts.
static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

// Per-OS-proc information gathered via cpuid while bound to that proc.
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

// qsort comparator: orders apicThreadInfo by (pkgId, coreId, threadId).
static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// Data-cache hierarchy description decoded from CPUID leaf 4 on the
// current processor (level number and APIC-id sharing mask per cache).
class kmp_cache_info_t {
public:
  struct info_t {
    unsigned level, mask;
  };
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth;
}
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  // Map a cache level number (1..3) to the corresponding KMP_HW_* layer.
  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    switch (level) {
    case 1:
      return KMP_HW_L1;
    case 2:
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

private:
  static const int MAX_CACHE_LEVEL = 3;

  size_t depth;
  info_t table[MAX_CACHE_LEVEL];

  // Enumerate CPUID leaf 4 subleaves, recording each data/unified cache's
  // level and the mask selecting the APIC-id bits shared by that cache.
  void get_leaf4_levels() {
    unsigned level = 0;
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      kmp_cpuid buf2;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      // cache type 0 means no more caches
      if (!cache_type)
        break;
      // Skip instruction caches
      if (cache_type == 2) {
        level++;
        continue;
      }
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      // NOTE(review): (-1) << n left-shifts a negative value, which is
      // formally undefined behavior in C++; relies on two's-complement
      // behavior of supported compilers — consider ~0u << n.
      table[depth].mask = ((-1) << cache_mask_width);
      depth++;
      level++;
    }
  }
};

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
  }

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return false;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty when for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    return true;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a uniqie Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen were it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    // edx bit 9 indicates the APIC is present and enabled.
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return false;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return false;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  previous_affinity.restore();

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      // Two entries with identical (pkg, core, thread) ids: APIC ids are
      // not unique, so this decoding method cannot be trusted.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return false;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bounds to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return false;
    }
  }
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;
  __kmp_ncores = nCores;
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int idx = 0;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;
  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
  int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
  kmp_hw_t types[3];
  if (pkgLevel >= 0)
    types[idx++] = KMP_HW_SOCKET;
  if (coreLevel >= 0)
    types[idx++] = KMP_HW_CORE;
  if (threadLevel >= 0)
    types[idx++] = KMP_HW_THREAD;

  KMP_ASSERT(depth > 0);
  __kmp_topology = kmp_topology_t::allocate(nApics, depth, types);

  // Copy the decoded (pkg, core, thread) ids into the topology object.
  for (i = 0; i < nApics; ++i) {
    idx = 0;
    unsigned os = threadInfo[i].osId;
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(i);
    hw_thread.clear();

    if (pkgLevel >= 0) {
      hw_thread.ids[idx++] = threadInfo[i].pkgId;
    }
    if (coreLevel >= 0) {
      hw_thread.ids[idx++] = threadInfo[i].coreId;
    }
    if (threadLevel >= 0) {
      hw_thread.ids[idx++] = threadInfo[i].threadId;
    }
    hw_thread.os_id = os;
  }

  __kmp_free(threadInfo);
  __kmp_topology->sort_ids();
  if (!__kmp_topology->check_ids()) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
    *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
    return false;
  }
  return true;
}

// Hybrid cpu detection using CPUID.1A
// Thread should be pinned to processor already
static void __kmp_get_hybrid_info(kmp_hw_core_type_t *type,
                                  unsigned *native_model_id) {
  kmp_cpuid buf;
  __kmp_x86_cpuid(0x1a, 0, &buf);
  // EAX bits 24:31 = core type; bits 0:23 = native model id.
  *type = (kmp_hw_core_type_t)__kmp_extract_bits<24, 31>(buf.eax);
  *native_model_id = __kmp_extract_bits<0, 23>(buf.eax);
}

// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on CPUID.B or CPUID.1F
/*
 * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
    Bits            Bits            Bits           Bits
    31-16           15-8            7-4            4-0
---+-----------+--------------+-------------+-----------------+
EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
---+-----------|--------------+-------------+-----------------|
EBX| reserved  |  Num logical processors at level (16 bits)   |
---+-----------|--------------+-------------------------------|
ECX| reserved  |   Level Type |      Level Number (8 bits)    |
---+-----------+--------------+-------------------------------|
EDX|                    X2APIC ID (32 bits)                   |
---+----------------------------------------------------------+
*/

// Level-type encodings returned in CPUID.B/1F ECX[15:8].
enum {
  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
  INTEL_LEVEL_TYPE_SMT = 1,
  INTEL_LEVEL_TYPE_CORE = 2,
  INTEL_LEVEL_TYPE_TILE = 3,
  INTEL_LEVEL_TYPE_MODULE = 4,
  INTEL_LEVEL_TYPE_DIE = 5,
  INTEL_LEVEL_TYPE_LAST = 6,
};

// One decoded CPUID.B/1F subleaf: level type, apic-id masks and width,
// and the number of logical processors at this level.
struct cpuid_level_info_t {
  unsigned level_type, mask, mask_width, nitems, cache_mask;
};

// Map an INTEL_LEVEL_TYPE_* encoding to the corresponding KMP_HW_* layer.
static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
  switch (intel_type) {
  case INTEL_LEVEL_TYPE_INVALID:
    return KMP_HW_SOCKET; // the "invalid" level terminator marks the package
  case INTEL_LEVEL_TYPE_SMT:
    return KMP_HW_THREAD;
  case INTEL_LEVEL_TYPE_CORE:
    return KMP_HW_CORE;
  case INTEL_LEVEL_TYPE_TILE:
    return KMP_HW_TILE;
  case INTEL_LEVEL_TYPE_MODULE:
    return KMP_HW_MODULE;
  case INTEL_LEVEL_TYPE_DIE:
    return KMP_HW_DIE;
  }
  return KMP_HW_UNKNOWN;
}

// This function takes the topology leaf, a levels array to store the levels
// detected and a bitmap of the known levels.
// Returns the number of levels in the topology
static unsigned
__kmp_x2apicid_get_levels(int leaf,
                          cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
                          kmp_uint64 known_levels) {
  unsigned level, levels_index;
  unsigned level_type, mask_width, nitems;
  kmp_cpuid buf;

  // New algorithm has known topology layers act as highest unknown topology
  // layers when unknown topology layers exist.
  // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z>
  // are unknown topology layers, Then SMT will take the characteristics of
  // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>).
  // This eliminates unknown portions of the topology while still keeping the
  // correct structure.
  level = levels_index = 0;
  do {
    __kmp_x86_cpuid(leaf, level, &buf);
    level_type = __kmp_extract_bits<8, 15>(buf.ecx);
    mask_width = __kmp_extract_bits<0, 4>(buf.eax);
    nitems = __kmp_extract_bits<0, 15>(buf.ebx);
    // A valid level with zero logical processors means the leaf is unusable.
    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
      return 0;

    if (known_levels & (1ull << level_type)) {
      // Add a new level to the topology
      KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
      levels[levels_index].level_type = level_type;
      levels[levels_index].mask_width = mask_width;
      levels[levels_index].nitems = nitems;
      levels_index++;
    } else {
      // If it is an unknown level, then logically move the previous layer up
      if (levels_index > 0) {
        levels[levels_index - 1].mask_width = mask_width;
        levels[levels_index - 1].nitems = nitems;
      }
    }
    level++;
  } while (level_type != INTEL_LEVEL_TYPE_INVALID);

  // Set the masks to & with apicid
  // NOTE(review): (-1) << n left-shifts a negative value (formally UB in
  // C++); relies on two's-complement behavior of supported compilers.
  for (unsigned i = 0; i < levels_index; ++i) {
    if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
      // Low mask_width bits, with the bits of all inner levels removed.
      levels[i].mask = ~((-1) << levels[i].mask_width);
      levels[i].cache_mask = (-1) << levels[i].mask_width;
      for (unsigned j = 0; j < i; ++j)
        levels[i].mask ^= levels[j].mask;
    } else {
      // Package level: everything above the widest enumerated level.
      KMP_DEBUG_ASSERT(levels_index > 0);
      levels[i].mask = (-1) << levels[i - 1].mask_width;
      levels[i].cache_mask = 0;
    }
  }
  return levels_index;
}

// Build __kmp_topology using the x2APIC id enumeration (CPUID leaf 1F or B).
static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) {

  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
  kmp_hw_t
types[INTEL_LEVEL_TYPE_LAST]; 1953 unsigned levels_index; 1954 kmp_cpuid buf; 1955 kmp_uint64 known_levels; 1956 int topology_leaf, highest_leaf, apic_id; 1957 int num_leaves; 1958 static int leaves[] = {0, 0}; 1959 1960 kmp_i18n_id_t leaf_message_id; 1961 1962 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 1963 1964 *msg_id = kmp_i18n_null; 1965 if (__kmp_affinity_verbose) { 1966 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 1967 } 1968 1969 // Figure out the known topology levels 1970 known_levels = 0ull; 1971 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 1972 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 1973 known_levels |= (1ull << i); 1974 } 1975 } 1976 1977 // Get the highest cpuid leaf supported 1978 __kmp_x86_cpuid(0, 0, &buf); 1979 highest_leaf = buf.eax; 1980 1981 // If a specific topology method was requested, only allow that specific leaf 1982 // otherwise, try both leaves 31 and 11 in that order 1983 num_leaves = 0; 1984 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 1985 num_leaves = 1; 1986 leaves[0] = 11; 1987 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1988 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 1989 num_leaves = 1; 1990 leaves[0] = 31; 1991 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 1992 } else { 1993 num_leaves = 2; 1994 leaves[0] = 31; 1995 leaves[1] = 11; 1996 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1997 } 1998 1999 // Check to see if cpuid leaf 31 or 11 is supported. 
2000 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2001 topology_leaf = -1; 2002 for (int i = 0; i < num_leaves; ++i) { 2003 int leaf = leaves[i]; 2004 if (highest_leaf < leaf) 2005 continue; 2006 __kmp_x86_cpuid(leaf, 0, &buf); 2007 if (buf.ebx == 0) 2008 continue; 2009 topology_leaf = leaf; 2010 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 2011 if (levels_index == 0) 2012 continue; 2013 break; 2014 } 2015 if (topology_leaf == -1 || levels_index == 0) { 2016 *msg_id = leaf_message_id; 2017 return false; 2018 } 2019 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 2020 2021 // The algorithm used starts by setting the affinity to each available thread 2022 // and retrieving info from the cpuid instruction, so if we are not capable of 2023 // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then 2024 // we need to do something else - use the defaults that we calculated from 2025 // issuing cpuid without binding to each proc. 2026 if (!KMP_AFFINITY_CAPABLE()) { 2027 // Hack to try and infer the machine topology using only the data 2028 // available from cpuid on the current thread, and __kmp_xproc. 2029 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2030 for (unsigned i = 0; i < levels_index; ++i) { 2031 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 2032 __kmp_nThreadsPerCore = levels[i].nitems; 2033 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 2034 nCoresPerPkg = levels[i].nitems; 2035 } 2036 } 2037 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 2038 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 2039 return true; 2040 } 2041 2042 // Allocate the data structure to be returned. 
2043 int depth = levels_index; 2044 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 2045 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 2046 __kmp_topology = 2047 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 2048 2049 // Insert equivalent cache types if they exist 2050 kmp_cache_info_t cache_info; 2051 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 2052 const kmp_cache_info_t::info_t &info = cache_info[i]; 2053 unsigned cache_mask = info.mask; 2054 unsigned cache_level = info.level; 2055 for (unsigned j = 0; j < levels_index; ++j) { 2056 unsigned hw_cache_mask = levels[j].cache_mask; 2057 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 2058 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 2059 kmp_hw_t type = 2060 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 2061 __kmp_topology->set_equivalent_type(cache_type, type); 2062 } 2063 } 2064 } 2065 2066 // From here on, we can assume that it is safe to call 2067 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2068 // __kmp_affinity_type = affinity_none. 2069 2070 // Save the affinity mask for the current thread. 2071 kmp_affinity_raii_t previous_affinity; 2072 2073 // Run through each of the available contexts, binding the current thread 2074 // to it, and obtaining the pertinent information using the cpuid instr. 2075 unsigned int proc; 2076 int hw_thread_index = 0; 2077 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2078 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2079 unsigned my_levels_index; 2080 2081 // Skip this proc if it is not included in the machine model. 
2082 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2083 continue; 2084 } 2085 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2086 2087 __kmp_affinity_dispatch->bind_thread(proc); 2088 2089 // New algorithm 2090 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2091 apic_id = buf.edx; 2092 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2093 my_levels_index = 2094 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2095 if (my_levels_index == 0 || my_levels_index != levels_index) { 2096 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2097 return false; 2098 } 2099 hw_thread.clear(); 2100 hw_thread.os_id = proc; 2101 // Put in topology information 2102 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2103 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2104 if (j > 0) { 2105 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2106 } 2107 } 2108 // Hybrid information 2109 if (__kmp_is_hybrid_cpu() && highest_leaf >= 0x1a) { 2110 kmp_hw_core_type_t type; 2111 unsigned native_model_id; 2112 __kmp_get_hybrid_info(&type, &native_model_id); 2113 hw_thread.core_type = type; 2114 } 2115 hw_thread_index++; 2116 } 2117 KMP_ASSERT(hw_thread_index > 0); 2118 __kmp_topology->sort_ids(); 2119 if (!__kmp_topology->check_ids()) { 2120 kmp_topology_t::deallocate(__kmp_topology); 2121 __kmp_topology = nullptr; 2122 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2123 return false; 2124 } 2125 return true; 2126 } 2127 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2128 2129 #define osIdIndex 0 2130 #define threadIdIndex 1 2131 #define coreIdIndex 2 2132 #define pkgIdIndex 3 2133 #define nodeIdIndex 4 2134 2135 typedef unsigned *ProcCpuInfo; 2136 static unsigned maxIndex = pkgIdIndex; 2137 2138 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2139 const void *b) { 2140 unsigned i; 2141 const unsigned *aa = *(unsigned *const *)a; 2142 const unsigned *bb = *(unsigned *const *)b; 2143 for (i = maxIndex;; i--) { 2144 if (aa[i] < 
bb[i]) 2145 return -1; 2146 if (aa[i] > bb[i]) 2147 return 1; 2148 if (i == osIdIndex) 2149 break; 2150 } 2151 return 0; 2152 } 2153 2154 #if KMP_USE_HIER_SCHED 2155 // Set the array sizes for the hierarchy layers 2156 static void __kmp_dispatch_set_hierarchy_values() { 2157 // Set the maximum number of L1's to number of cores 2158 // Set the maximum number of L2's to to either number of cores / 2 for 2159 // Intel(R) Xeon Phi(TM) coprocessor formally codenamed Knights Landing 2160 // Or the number of cores for Intel(R) Xeon(R) processors 2161 // Set the maximum number of NUMA nodes and L3's to number of packages 2162 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2163 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2164 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2165 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2166 KMP_MIC_SUPPORTED 2167 if (__kmp_mic_type >= mic3) 2168 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2169 else 2170 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2171 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2172 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2173 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2174 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2175 // Set the number of threads per unit 2176 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2177 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2178 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2179 __kmp_nThreadsPerCore; 2180 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2181 KMP_MIC_SUPPORTED 2182 if (__kmp_mic_type >= mic3) 2183 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2184 2 * __kmp_nThreadsPerCore; 2185 else 2186 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2187 
__kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2188 __kmp_nThreadsPerCore; 2189 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2190 nCoresPerPkg * __kmp_nThreadsPerCore; 2191 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2192 nCoresPerPkg * __kmp_nThreadsPerCore; 2193 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2194 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2195 } 2196 2197 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2198 // i.e., this thread's L1 or this thread's L2, etc. 2199 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2200 int index = type + 1; 2201 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2202 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2203 if (type == kmp_hier_layer_e::LAYER_THREAD) 2204 return tid; 2205 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2206 return 0; 2207 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2208 if (tid >= num_hw_threads) 2209 tid = tid % num_hw_threads; 2210 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2211 } 2212 2213 // Return the number of t1's per t2 2214 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2215 int i1 = t1 + 1; 2216 int i2 = t2 + 1; 2217 KMP_DEBUG_ASSERT(i1 <= i2); 2218 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2219 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2220 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2221 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2222 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2223 } 2224 #endif // KMP_USE_HIER_SCHED 2225 2226 static inline const char *__kmp_cpuinfo_get_filename() { 2227 const char *filename; 2228 if (__kmp_cpuinfo_file != nullptr) 2229 filename = __kmp_cpuinfo_file; 2230 else 2231 filename = "/proc/cpuinfo"; 2232 return filename; 2233 } 2234 2235 static inline const char *__kmp_cpuinfo_get_envvar() { 
2236 const char *envvar = nullptr; 2237 if (__kmp_cpuinfo_file != nullptr) 2238 envvar = "KMP_CPUINFO_FILE"; 2239 return envvar; 2240 } 2241 2242 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2243 // affinity map. 2244 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2245 kmp_i18n_id_t *const msg_id) { 2246 const char *filename = __kmp_cpuinfo_get_filename(); 2247 const char *envvar = __kmp_cpuinfo_get_envvar(); 2248 *msg_id = kmp_i18n_null; 2249 2250 if (__kmp_affinity_verbose) { 2251 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2252 } 2253 2254 kmp_safe_raii_file_t f(filename, "r", envvar); 2255 2256 // Scan of the file, and count the number of "processor" (osId) fields, 2257 // and find the highest value of <n> for a node_<n> field. 2258 char buf[256]; 2259 unsigned num_records = 0; 2260 while (!feof(f)) { 2261 buf[sizeof(buf) - 1] = 1; 2262 if (!fgets(buf, sizeof(buf), f)) { 2263 // Read errors presumably because of EOF 2264 break; 2265 } 2266 2267 char s1[] = "processor"; 2268 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2269 num_records++; 2270 continue; 2271 } 2272 2273 // FIXME - this will match "node_<n> <garbage>" 2274 unsigned level; 2275 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2276 // validate the input fisrt: 2277 if (level > (unsigned)__kmp_xproc) { // level is too big 2278 level = __kmp_xproc; 2279 } 2280 if (nodeIdIndex + level >= maxIndex) { 2281 maxIndex = nodeIdIndex + level; 2282 } 2283 continue; 2284 } 2285 } 2286 2287 // Check for empty file / no valid processor records, or too many. The number 2288 // of records can't exceed the number of valid bits in the affinity mask. 
2289 if (num_records == 0) { 2290 *msg_id = kmp_i18n_str_NoProcRecords; 2291 return false; 2292 } 2293 if (num_records > (unsigned)__kmp_xproc) { 2294 *msg_id = kmp_i18n_str_TooManyProcRecords; 2295 return false; 2296 } 2297 2298 // Set the file pointer back to the beginning, so that we can scan the file 2299 // again, this time performing a full parse of the data. Allocate a vector of 2300 // ProcCpuInfo object, where we will place the data. Adding an extra element 2301 // at the end allows us to remove a lot of extra checks for termination 2302 // conditions. 2303 if (fseek(f, 0, SEEK_SET) != 0) { 2304 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2305 return false; 2306 } 2307 2308 // Allocate the array of records to store the proc info in. The dummy 2309 // element at the end makes the logic in filling them out easier to code. 2310 unsigned **threadInfo = 2311 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2312 unsigned i; 2313 for (i = 0; i <= num_records; i++) { 2314 threadInfo[i] = 2315 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2316 } 2317 2318 #define CLEANUP_THREAD_INFO \ 2319 for (i = 0; i <= num_records; i++) { \ 2320 __kmp_free(threadInfo[i]); \ 2321 } \ 2322 __kmp_free(threadInfo); 2323 2324 // A value of UINT_MAX means that we didn't find the field 2325 unsigned __index; 2326 2327 #define INIT_PROC_INFO(p) \ 2328 for (__index = 0; __index <= maxIndex; __index++) { \ 2329 (p)[__index] = UINT_MAX; \ 2330 } 2331 2332 for (i = 0; i <= num_records; i++) { 2333 INIT_PROC_INFO(threadInfo[i]); 2334 } 2335 2336 unsigned num_avail = 0; 2337 *line = 0; 2338 while (!feof(f)) { 2339 // Create an inner scoping level, so that all the goto targets at the end of 2340 // the loop appear in an outer scoping level. This avoids warnings about 2341 // jumping past an initialization to a target in the same block. 
2342 { 2343 buf[sizeof(buf) - 1] = 1; 2344 bool long_line = false; 2345 if (!fgets(buf, sizeof(buf), f)) { 2346 // Read errors presumably because of EOF 2347 // If there is valid data in threadInfo[num_avail], then fake 2348 // a blank line in ensure that the last address gets parsed. 2349 bool valid = false; 2350 for (i = 0; i <= maxIndex; i++) { 2351 if (threadInfo[num_avail][i] != UINT_MAX) { 2352 valid = true; 2353 } 2354 } 2355 if (!valid) { 2356 break; 2357 } 2358 buf[0] = 0; 2359 } else if (!buf[sizeof(buf) - 1]) { 2360 // The line is longer than the buffer. Set a flag and don't 2361 // emit an error if we were going to ignore the line, anyway. 2362 long_line = true; 2363 2364 #define CHECK_LINE \ 2365 if (long_line) { \ 2366 CLEANUP_THREAD_INFO; \ 2367 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2368 return false; \ 2369 } 2370 } 2371 (*line)++; 2372 2373 char s1[] = "processor"; 2374 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2375 CHECK_LINE; 2376 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2377 unsigned val; 2378 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2379 goto no_val; 2380 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2381 #if KMP_ARCH_AARCH64 2382 // Handle the old AArch64 /proc/cpuinfo layout differently, 2383 // it contains all of the 'processor' entries listed in a 2384 // single 'Processor' section, therefore the normal looking 2385 // for duplicates in that section will always fail. 
2386 num_avail++; 2387 #else 2388 goto dup_field; 2389 #endif 2390 threadInfo[num_avail][osIdIndex] = val; 2391 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2392 char path[256]; 2393 KMP_SNPRINTF( 2394 path, sizeof(path), 2395 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2396 threadInfo[num_avail][osIdIndex]); 2397 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2398 2399 KMP_SNPRINTF(path, sizeof(path), 2400 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2401 threadInfo[num_avail][osIdIndex]); 2402 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2403 continue; 2404 #else 2405 } 2406 char s2[] = "physical id"; 2407 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2408 CHECK_LINE; 2409 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2410 unsigned val; 2411 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2412 goto no_val; 2413 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2414 goto dup_field; 2415 threadInfo[num_avail][pkgIdIndex] = val; 2416 continue; 2417 } 2418 char s3[] = "core id"; 2419 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2420 CHECK_LINE; 2421 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2422 unsigned val; 2423 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2424 goto no_val; 2425 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2426 goto dup_field; 2427 threadInfo[num_avail][coreIdIndex] = val; 2428 continue; 2429 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2430 } 2431 char s4[] = "thread id"; 2432 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2433 CHECK_LINE; 2434 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2435 unsigned val; 2436 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2437 goto no_val; 2438 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2439 goto dup_field; 2440 threadInfo[num_avail][threadIdIndex] = val; 2441 continue; 2442 } 2443 unsigned level; 2444 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2445 CHECK_LINE; 2446 char *p 
= strchr(buf + sizeof(s4) - 1, ':'); 2447 unsigned val; 2448 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2449 goto no_val; 2450 // validate the input before using level: 2451 if (level > (unsigned)__kmp_xproc) { // level is too big 2452 level = __kmp_xproc; 2453 } 2454 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2455 goto dup_field; 2456 threadInfo[num_avail][nodeIdIndex + level] = val; 2457 continue; 2458 } 2459 2460 // We didn't recognize the leading token on the line. There are lots of 2461 // leading tokens that we don't recognize - if the line isn't empty, go on 2462 // to the next line. 2463 if ((*buf != 0) && (*buf != '\n')) { 2464 // If the line is longer than the buffer, read characters 2465 // until we find a newline. 2466 if (long_line) { 2467 int ch; 2468 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2469 ; 2470 } 2471 continue; 2472 } 2473 2474 // A newline has signalled the end of the processor record. 2475 // Check that there aren't too many procs specified. 2476 if ((int)num_avail == __kmp_xproc) { 2477 CLEANUP_THREAD_INFO; 2478 *msg_id = kmp_i18n_str_TooManyEntries; 2479 return false; 2480 } 2481 2482 // Check for missing fields. The osId field must be there, and we 2483 // currently require that the physical id field is specified, also. 2484 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2485 CLEANUP_THREAD_INFO; 2486 *msg_id = kmp_i18n_str_MissingProcField; 2487 return false; 2488 } 2489 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2490 CLEANUP_THREAD_INFO; 2491 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2492 return false; 2493 } 2494 2495 // Skip this proc if it is not included in the machine model. 2496 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2497 __kmp_affin_fullMask)) { 2498 INIT_PROC_INFO(threadInfo[num_avail]); 2499 continue; 2500 } 2501 2502 // We have a successful parse of this proc's info. 2503 // Increment the counter, and prepare for the next proc. 
2504 num_avail++; 2505 KMP_ASSERT(num_avail <= num_records); 2506 INIT_PROC_INFO(threadInfo[num_avail]); 2507 } 2508 continue; 2509 2510 no_val: 2511 CLEANUP_THREAD_INFO; 2512 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2513 return false; 2514 2515 dup_field: 2516 CLEANUP_THREAD_INFO; 2517 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2518 return false; 2519 } 2520 *line = 0; 2521 2522 #if KMP_MIC && REDUCE_TEAM_SIZE 2523 unsigned teamSize = 0; 2524 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2525 2526 // check for num_records == __kmp_xproc ??? 2527 2528 // If it is configured to omit the package level when there is only a single 2529 // package, the logic at the end of this routine won't work if there is only a 2530 // single thread 2531 KMP_ASSERT(num_avail > 0); 2532 KMP_ASSERT(num_avail <= num_records); 2533 2534 // Sort the threadInfo table by physical Id. 2535 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2536 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2537 2538 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2539 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2540 // the chips on a system. Although coreId's are usually assigned 2541 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2542 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2543 // 2544 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2545 // total # packages) are at this point - we want to determine that now. We 2546 // only have an upper bound on the first two figures. 
2547 unsigned *counts = 2548 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2549 unsigned *maxCt = 2550 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2551 unsigned *totals = 2552 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2553 unsigned *lastId = 2554 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2555 2556 bool assign_thread_ids = false; 2557 unsigned threadIdCt; 2558 unsigned index; 2559 2560 restart_radix_check: 2561 threadIdCt = 0; 2562 2563 // Initialize the counter arrays with data from threadInfo[0]. 2564 if (assign_thread_ids) { 2565 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2566 threadInfo[0][threadIdIndex] = threadIdCt++; 2567 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2568 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2569 } 2570 } 2571 for (index = 0; index <= maxIndex; index++) { 2572 counts[index] = 1; 2573 maxCt[index] = 1; 2574 totals[index] = 1; 2575 lastId[index] = threadInfo[0][index]; 2576 ; 2577 } 2578 2579 // Run through the rest of the OS procs. 2580 for (i = 1; i < num_avail; i++) { 2581 // Find the most significant index whose id differs from the id for the 2582 // previous OS proc. 2583 for (index = maxIndex; index >= threadIdIndex; index--) { 2584 if (assign_thread_ids && (index == threadIdIndex)) { 2585 // Auto-assign the thread id field if it wasn't specified. 2586 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2587 threadInfo[i][threadIdIndex] = threadIdCt++; 2588 } 2589 // Apparently the thread id field was specified for some entries and not 2590 // others. Start the thread id counter off at the next higher thread id. 2591 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2592 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2593 } 2594 } 2595 if (threadInfo[i][index] != lastId[index]) { 2596 // Run through all indices which are less significant, and reset the 2597 // counts to 1. 
At all levels up to and including index, we need to 2598 // increment the totals and record the last id. 2599 unsigned index2; 2600 for (index2 = threadIdIndex; index2 < index; index2++) { 2601 totals[index2]++; 2602 if (counts[index2] > maxCt[index2]) { 2603 maxCt[index2] = counts[index2]; 2604 } 2605 counts[index2] = 1; 2606 lastId[index2] = threadInfo[i][index2]; 2607 } 2608 counts[index]++; 2609 totals[index]++; 2610 lastId[index] = threadInfo[i][index]; 2611 2612 if (assign_thread_ids && (index > threadIdIndex)) { 2613 2614 #if KMP_MIC && REDUCE_TEAM_SIZE 2615 // The default team size is the total #threads in the machine 2616 // minus 1 thread for every core that has 3 or more threads. 2617 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2618 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2619 2620 // Restart the thread counter, as we are on a new core. 2621 threadIdCt = 0; 2622 2623 // Auto-assign the thread id field if it wasn't specified. 2624 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2625 threadInfo[i][threadIdIndex] = threadIdCt++; 2626 } 2627 2628 // Apparently the thread id field was specified for some entries and 2629 // not others. Start the thread id counter off at the next higher 2630 // thread id. 2631 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2632 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2633 } 2634 } 2635 break; 2636 } 2637 } 2638 if (index < threadIdIndex) { 2639 // If thread ids were specified, it is an error if they are not unique. 2640 // Also, check that we waven't already restarted the loop (to be safe - 2641 // shouldn't need to). 
2642 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2643 __kmp_free(lastId); 2644 __kmp_free(totals); 2645 __kmp_free(maxCt); 2646 __kmp_free(counts); 2647 CLEANUP_THREAD_INFO; 2648 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2649 return false; 2650 } 2651 2652 // If the thread ids were not specified and we see entries entries that 2653 // are duplicates, start the loop over and assign the thread ids manually. 2654 assign_thread_ids = true; 2655 goto restart_radix_check; 2656 } 2657 } 2658 2659 #if KMP_MIC && REDUCE_TEAM_SIZE 2660 // The default team size is the total #threads in the machine 2661 // minus 1 thread for every core that has 3 or more threads. 2662 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2663 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2664 2665 for (index = threadIdIndex; index <= maxIndex; index++) { 2666 if (counts[index] > maxCt[index]) { 2667 maxCt[index] = counts[index]; 2668 } 2669 } 2670 2671 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2672 nCoresPerPkg = maxCt[coreIdIndex]; 2673 nPackages = totals[pkgIdIndex]; 2674 2675 // When affinity is off, this routine will still be called to set 2676 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2677 // Make sure all these vars are set correctly, and return now if affinity is 2678 // not enabled. 2679 __kmp_ncores = totals[coreIdIndex]; 2680 if (!KMP_AFFINITY_CAPABLE()) { 2681 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2682 return true; 2683 } 2684 2685 #if KMP_MIC && REDUCE_TEAM_SIZE 2686 // Set the default team size. 
2687 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2688 __kmp_dflt_team_nth = teamSize; 2689 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2690 "__kmp_dflt_team_nth = %d\n", 2691 __kmp_dflt_team_nth)); 2692 } 2693 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2694 2695 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2696 2697 // Count the number of levels which have more nodes at that level than at the 2698 // parent's level (with there being an implicit root node of the top level). 2699 // This is equivalent to saying that there is at least one node at this level 2700 // which has a sibling. These levels are in the map, and the package level is 2701 // always in the map. 2702 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2703 for (index = threadIdIndex; index < maxIndex; index++) { 2704 KMP_ASSERT(totals[index] >= totals[index + 1]); 2705 inMap[index] = (totals[index] > totals[index + 1]); 2706 } 2707 inMap[maxIndex] = (totals[maxIndex] > 1); 2708 inMap[pkgIdIndex] = true; 2709 inMap[coreIdIndex] = true; 2710 inMap[threadIdIndex] = true; 2711 2712 int depth = 0; 2713 int idx = 0; 2714 kmp_hw_t types[KMP_HW_LAST]; 2715 int pkgLevel = -1; 2716 int coreLevel = -1; 2717 int threadLevel = -1; 2718 for (index = threadIdIndex; index <= maxIndex; index++) { 2719 if (inMap[index]) { 2720 depth++; 2721 } 2722 } 2723 if (inMap[pkgIdIndex]) { 2724 pkgLevel = idx; 2725 types[idx++] = KMP_HW_SOCKET; 2726 } 2727 if (inMap[coreIdIndex]) { 2728 coreLevel = idx; 2729 types[idx++] = KMP_HW_CORE; 2730 } 2731 if (inMap[threadIdIndex]) { 2732 threadLevel = idx; 2733 types[idx++] = KMP_HW_THREAD; 2734 } 2735 KMP_ASSERT(depth > 0); 2736 2737 // Construct the data structure that is to be returned. 
2738 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2739 2740 for (i = 0; i < num_avail; ++i) { 2741 unsigned os = threadInfo[i][osIdIndex]; 2742 int src_index; 2743 int dst_index = 0; 2744 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2745 hw_thread.clear(); 2746 hw_thread.os_id = os; 2747 2748 idx = 0; 2749 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2750 if (!inMap[src_index]) { 2751 continue; 2752 } 2753 if (src_index == pkgIdIndex) { 2754 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2755 } else if (src_index == coreIdIndex) { 2756 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2757 } else if (src_index == threadIdIndex) { 2758 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2759 } 2760 dst_index++; 2761 } 2762 } 2763 2764 __kmp_free(inMap); 2765 __kmp_free(lastId); 2766 __kmp_free(totals); 2767 __kmp_free(maxCt); 2768 __kmp_free(counts); 2769 CLEANUP_THREAD_INFO; 2770 __kmp_topology->sort_ids(); 2771 if (!__kmp_topology->check_ids()) { 2772 kmp_topology_t::deallocate(__kmp_topology); 2773 __kmp_topology = nullptr; 2774 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2775 return false; 2776 } 2777 return true; 2778 } 2779 2780 // Create and return a table of affinity masks, indexed by OS thread ID. 2781 // This routine handles OR'ing together all the affinity masks of threads 2782 // that are sufficiently close, if granularity > fine. 2783 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2784 unsigned *numUnique) { 2785 // First form a table of affinity masks in order of OS thread id. 
2786 int maxOsId; 2787 int i; 2788 int numAddrs = __kmp_topology->get_num_hw_threads(); 2789 int depth = __kmp_topology->get_depth(); 2790 KMP_ASSERT(numAddrs); 2791 KMP_ASSERT(depth); 2792 2793 maxOsId = 0; 2794 for (i = numAddrs - 1;; --i) { 2795 int osId = __kmp_topology->at(i).os_id; 2796 if (osId > maxOsId) { 2797 maxOsId = osId; 2798 } 2799 if (i == 0) 2800 break; 2801 } 2802 kmp_affin_mask_t *osId2Mask; 2803 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2804 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2805 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2806 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2807 } 2808 if (__kmp_affinity_gran_levels >= (int)depth) { 2809 if (__kmp_affinity_verbose || 2810 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2811 KMP_WARNING(AffThreadsMayMigrate); 2812 } 2813 } 2814 2815 // Run through the table, forming the masks for all threads on each core. 2816 // Threads on the same core will have identical kmp_hw_thread_t objects, not 2817 // considering the last level, which must be the thread id. All threads on a 2818 // core will appear consecutively. 2819 int unique = 0; 2820 int j = 0; // index of 1st thread on core 2821 int leader = 0; 2822 kmp_affin_mask_t *sum; 2823 KMP_CPU_ALLOC_ON_STACK(sum); 2824 KMP_CPU_ZERO(sum); 2825 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 2826 for (i = 1; i < numAddrs; i++) { 2827 // If this thread is sufficiently close to the leader (within the 2828 // granularity setting), then set the bit for this os thread in the 2829 // affinity mask for this group, and go on to the next thread. 2830 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 2831 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2832 continue; 2833 } 2834 2835 // For every thread in this group, copy the mask to the thread's entry in 2836 // the osId2Mask table. Mark the first address as a leader. 
2837 for (; j < i; j++) { 2838 int osId = __kmp_topology->at(j).os_id; 2839 KMP_DEBUG_ASSERT(osId <= maxOsId); 2840 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2841 KMP_CPU_COPY(mask, sum); 2842 __kmp_topology->at(j).leader = (j == leader); 2843 } 2844 unique++; 2845 2846 // Start a new mask. 2847 leader = i; 2848 KMP_CPU_ZERO(sum); 2849 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2850 } 2851 2852 // For every thread in last group, copy the mask to the thread's 2853 // entry in the osId2Mask table. 2854 for (; j < i; j++) { 2855 int osId = __kmp_topology->at(j).os_id; 2856 KMP_DEBUG_ASSERT(osId <= maxOsId); 2857 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2858 KMP_CPU_COPY(mask, sum); 2859 __kmp_topology->at(j).leader = (j == leader); 2860 } 2861 unique++; 2862 KMP_CPU_FREE_FROM_STACK(sum); 2863 2864 *maxIndex = maxOsId; 2865 *numUnique = unique; 2866 return osId2Mask; 2867 } 2868 2869 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2870 // as file-static than to try and pass them through the calling sequence of 2871 // the recursive-descent OMP_PLACES parser. 
2872 static kmp_affin_mask_t *newMasks; 2873 static int numNewMasks; 2874 static int nextNewMask; 2875 2876 #define ADD_MASK(_mask) \ 2877 { \ 2878 if (nextNewMask >= numNewMasks) { \ 2879 int i; \ 2880 numNewMasks *= 2; \ 2881 kmp_affin_mask_t *temp; \ 2882 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2883 for (i = 0; i < numNewMasks / 2; i++) { \ 2884 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2885 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2886 KMP_CPU_COPY(dest, src); \ 2887 } \ 2888 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2889 newMasks = temp; \ 2890 } \ 2891 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2892 nextNewMask++; \ 2893 } 2894 2895 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2896 { \ 2897 if (((_osId) > _maxOsId) || \ 2898 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2899 if (__kmp_affinity_verbose || \ 2900 (__kmp_affinity_warnings && \ 2901 (__kmp_affinity_type != affinity_none))) { \ 2902 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2903 } \ 2904 } else { \ 2905 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2906 } \ 2907 } 2908 2909 // Re-parse the proclist (for the explicit affinity type), and form the list 2910 // of affinity newMasks indexed by gtid. 2911 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2912 unsigned int *out_numMasks, 2913 const char *proclist, 2914 kmp_affin_mask_t *osId2Mask, 2915 int maxOsId) { 2916 int i; 2917 const char *scan = proclist; 2918 const char *next = proclist; 2919 2920 // We use malloc() for the temporary mask vector, so that we can use 2921 // realloc() to extend it. 
2922 numNewMasks = 2; 2923 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2924 nextNewMask = 0; 2925 kmp_affin_mask_t *sumMask; 2926 KMP_CPU_ALLOC(sumMask); 2927 int setSize = 0; 2928 2929 for (;;) { 2930 int start, end, stride; 2931 2932 SKIP_WS(scan); 2933 next = scan; 2934 if (*next == '\0') { 2935 break; 2936 } 2937 2938 if (*next == '{') { 2939 int num; 2940 setSize = 0; 2941 next++; // skip '{' 2942 SKIP_WS(next); 2943 scan = next; 2944 2945 // Read the first integer in the set. 2946 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2947 SKIP_DIGITS(next); 2948 num = __kmp_str_to_int(scan, *next); 2949 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2950 2951 // Copy the mask for that osId to the sum (union) mask. 2952 if ((num > maxOsId) || 2953 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2954 if (__kmp_affinity_verbose || 2955 (__kmp_affinity_warnings && 2956 (__kmp_affinity_type != affinity_none))) { 2957 KMP_WARNING(AffIgnoreInvalidProcID, num); 2958 } 2959 KMP_CPU_ZERO(sumMask); 2960 } else { 2961 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2962 setSize = 1; 2963 } 2964 2965 for (;;) { 2966 // Check for end of set. 2967 SKIP_WS(next); 2968 if (*next == '}') { 2969 next++; // skip '}' 2970 break; 2971 } 2972 2973 // Skip optional comma. 2974 if (*next == ',') { 2975 next++; 2976 } 2977 SKIP_WS(next); 2978 2979 // Read the next integer in the set. 2980 scan = next; 2981 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2982 2983 SKIP_DIGITS(next); 2984 num = __kmp_str_to_int(scan, *next); 2985 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2986 2987 // Add the mask for that osId to the sum mask. 
2988 if ((num > maxOsId) || 2989 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2990 if (__kmp_affinity_verbose || 2991 (__kmp_affinity_warnings && 2992 (__kmp_affinity_type != affinity_none))) { 2993 KMP_WARNING(AffIgnoreInvalidProcID, num); 2994 } 2995 } else { 2996 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2997 setSize++; 2998 } 2999 } 3000 if (setSize > 0) { 3001 ADD_MASK(sumMask); 3002 } 3003 3004 SKIP_WS(next); 3005 if (*next == ',') { 3006 next++; 3007 } 3008 scan = next; 3009 continue; 3010 } 3011 3012 // Read the first integer. 3013 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3014 SKIP_DIGITS(next); 3015 start = __kmp_str_to_int(scan, *next); 3016 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 3017 SKIP_WS(next); 3018 3019 // If this isn't a range, then add a mask to the list and go on. 3020 if (*next != '-') { 3021 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3022 3023 // Skip optional comma. 3024 if (*next == ',') { 3025 next++; 3026 } 3027 scan = next; 3028 continue; 3029 } 3030 3031 // This is a range. Skip over the '-' and read in the 2nd int. 3032 next++; // skip '-' 3033 SKIP_WS(next); 3034 scan = next; 3035 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3036 SKIP_DIGITS(next); 3037 end = __kmp_str_to_int(scan, *next); 3038 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 3039 3040 // Check for a stride parameter 3041 stride = 1; 3042 SKIP_WS(next); 3043 if (*next == ':') { 3044 // A stride is specified. Skip over the ':" and read the 3rd int. 
3045 int sign = +1; 3046 next++; // skip ':' 3047 SKIP_WS(next); 3048 scan = next; 3049 if (*next == '-') { 3050 sign = -1; 3051 next++; 3052 SKIP_WS(next); 3053 scan = next; 3054 } 3055 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 3056 SKIP_DIGITS(next); 3057 stride = __kmp_str_to_int(scan, *next); 3058 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 3059 stride *= sign; 3060 } 3061 3062 // Do some range checks. 3063 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3064 if (stride > 0) { 3065 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3066 } else { 3067 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3068 } 3069 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3070 3071 // Add the mask for each OS proc # to the list. 3072 if (stride > 0) { 3073 do { 3074 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3075 start += stride; 3076 } while (start <= end); 3077 } else { 3078 do { 3079 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3080 start += stride; 3081 } while (start >= end); 3082 } 3083 3084 // Skip optional comma. 3085 SKIP_WS(next); 3086 if (*next == ',') { 3087 next++; 3088 } 3089 scan = next; 3090 } 3091 3092 *out_numMasks = nextNewMask; 3093 if (nextNewMask == 0) { 3094 *out_masks = NULL; 3095 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3096 return; 3097 } 3098 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3099 for (i = 0; i < nextNewMask; i++) { 3100 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3101 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3102 KMP_CPU_COPY(dest, src); 3103 } 3104 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3105 KMP_CPU_FREE(sumMask); 3106 } 3107 3108 /*----------------------------------------------------------------------------- 3109 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3110 places. 
Again, Here is the grammar: 3111 3112 place_list := place 3113 place_list := place , place_list 3114 place := num 3115 place := place : num 3116 place := place : num : signed 3117 place := { subplacelist } 3118 place := ! place // (lowest priority) 3119 subplace_list := subplace 3120 subplace_list := subplace , subplace_list 3121 subplace := num 3122 subplace := num : num 3123 subplace := num : num : signed 3124 signed := num 3125 signed := + signed 3126 signed := - signed 3127 -----------------------------------------------------------------------------*/ 3128 static void __kmp_process_subplace_list(const char **scan, 3129 kmp_affin_mask_t *osId2Mask, 3130 int maxOsId, kmp_affin_mask_t *tempMask, 3131 int *setSize) { 3132 const char *next; 3133 3134 for (;;) { 3135 int start, count, stride, i; 3136 3137 // Read in the starting proc id 3138 SKIP_WS(*scan); 3139 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3140 next = *scan; 3141 SKIP_DIGITS(next); 3142 start = __kmp_str_to_int(*scan, *next); 3143 KMP_ASSERT(start >= 0); 3144 *scan = next; 3145 3146 // valid follow sets are ',' ':' and '}' 3147 SKIP_WS(*scan); 3148 if (**scan == '}' || **scan == ',') { 3149 if ((start > maxOsId) || 3150 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3151 if (__kmp_affinity_verbose || 3152 (__kmp_affinity_warnings && 3153 (__kmp_affinity_type != affinity_none))) { 3154 KMP_WARNING(AffIgnoreInvalidProcID, start); 3155 } 3156 } else { 3157 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3158 (*setSize)++; 3159 } 3160 if (**scan == '}') { 3161 break; 3162 } 3163 (*scan)++; // skip ',' 3164 continue; 3165 } 3166 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3167 (*scan)++; // skip ':' 3168 3169 // Read count parameter 3170 SKIP_WS(*scan); 3171 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3172 next = *scan; 3173 SKIP_DIGITS(next); 3174 count = __kmp_str_to_int(*scan, *next); 3175 
KMP_ASSERT(count >= 0); 3176 *scan = next; 3177 3178 // valid follow sets are ',' ':' and '}' 3179 SKIP_WS(*scan); 3180 if (**scan == '}' || **scan == ',') { 3181 for (i = 0; i < count; i++) { 3182 if ((start > maxOsId) || 3183 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3184 if (__kmp_affinity_verbose || 3185 (__kmp_affinity_warnings && 3186 (__kmp_affinity_type != affinity_none))) { 3187 KMP_WARNING(AffIgnoreInvalidProcID, start); 3188 } 3189 break; // don't proliferate warnings for large count 3190 } else { 3191 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3192 start++; 3193 (*setSize)++; 3194 } 3195 } 3196 if (**scan == '}') { 3197 break; 3198 } 3199 (*scan)++; // skip ',' 3200 continue; 3201 } 3202 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3203 (*scan)++; // skip ':' 3204 3205 // Read stride parameter 3206 int sign = +1; 3207 for (;;) { 3208 SKIP_WS(*scan); 3209 if (**scan == '+') { 3210 (*scan)++; // skip '+' 3211 continue; 3212 } 3213 if (**scan == '-') { 3214 sign *= -1; 3215 (*scan)++; // skip '-' 3216 continue; 3217 } 3218 break; 3219 } 3220 SKIP_WS(*scan); 3221 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3222 next = *scan; 3223 SKIP_DIGITS(next); 3224 stride = __kmp_str_to_int(*scan, *next); 3225 KMP_ASSERT(stride >= 0); 3226 *scan = next; 3227 stride *= sign; 3228 3229 // valid follow sets are ',' and '}' 3230 SKIP_WS(*scan); 3231 if (**scan == '}' || **scan == ',') { 3232 for (i = 0; i < count; i++) { 3233 if ((start > maxOsId) || 3234 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3235 if (__kmp_affinity_verbose || 3236 (__kmp_affinity_warnings && 3237 (__kmp_affinity_type != affinity_none))) { 3238 KMP_WARNING(AffIgnoreInvalidProcID, start); 3239 } 3240 break; // don't proliferate warnings for large count 3241 } else { 3242 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3243 start += stride; 3244 (*setSize)++; 3245 } 3246 } 3247 if (**scan == '}') { 
3248 break; 3249 } 3250 (*scan)++; // skip ',' 3251 continue; 3252 } 3253 3254 KMP_ASSERT2(0, "bad explicit places list"); 3255 } 3256 } 3257 3258 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3259 int maxOsId, kmp_affin_mask_t *tempMask, 3260 int *setSize) { 3261 const char *next; 3262 3263 // valid follow sets are '{' '!' and num 3264 SKIP_WS(*scan); 3265 if (**scan == '{') { 3266 (*scan)++; // skip '{' 3267 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3268 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3269 (*scan)++; // skip '}' 3270 } else if (**scan == '!') { 3271 (*scan)++; // skip '!' 3272 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3273 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3274 } else if ((**scan >= '0') && (**scan <= '9')) { 3275 next = *scan; 3276 SKIP_DIGITS(next); 3277 int num = __kmp_str_to_int(*scan, *next); 3278 KMP_ASSERT(num >= 0); 3279 if ((num > maxOsId) || 3280 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3281 if (__kmp_affinity_verbose || 3282 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3283 KMP_WARNING(AffIgnoreInvalidProcID, num); 3284 } 3285 } else { 3286 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3287 (*setSize)++; 3288 } 3289 *scan = next; // skip num 3290 } else { 3291 KMP_ASSERT2(0, "bad explicit places list"); 3292 } 3293 } 3294 3295 // static void 3296 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3297 unsigned int *out_numMasks, 3298 const char *placelist, 3299 kmp_affin_mask_t *osId2Mask, 3300 int maxOsId) { 3301 int i, j, count, stride, sign; 3302 const char *scan = placelist; 3303 const char *next = placelist; 3304 3305 numNewMasks = 2; 3306 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3307 nextNewMask = 0; 3308 3309 // tempMask is modified based on the previous or initial 3310 // place to form the current place 3311 // previousMask contains the previous 
place 3312 kmp_affin_mask_t *tempMask; 3313 kmp_affin_mask_t *previousMask; 3314 KMP_CPU_ALLOC(tempMask); 3315 KMP_CPU_ZERO(tempMask); 3316 KMP_CPU_ALLOC(previousMask); 3317 KMP_CPU_ZERO(previousMask); 3318 int setSize = 0; 3319 3320 for (;;) { 3321 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3322 3323 // valid follow sets are ',' ':' and EOL 3324 SKIP_WS(scan); 3325 if (*scan == '\0' || *scan == ',') { 3326 if (setSize > 0) { 3327 ADD_MASK(tempMask); 3328 } 3329 KMP_CPU_ZERO(tempMask); 3330 setSize = 0; 3331 if (*scan == '\0') { 3332 break; 3333 } 3334 scan++; // skip ',' 3335 continue; 3336 } 3337 3338 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3339 scan++; // skip ':' 3340 3341 // Read count parameter 3342 SKIP_WS(scan); 3343 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3344 next = scan; 3345 SKIP_DIGITS(next); 3346 count = __kmp_str_to_int(scan, *next); 3347 KMP_ASSERT(count >= 0); 3348 scan = next; 3349 3350 // valid follow sets are ',' ':' and EOL 3351 SKIP_WS(scan); 3352 if (*scan == '\0' || *scan == ',') { 3353 stride = +1; 3354 } else { 3355 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3356 scan++; // skip ':' 3357 3358 // Read stride parameter 3359 sign = +1; 3360 for (;;) { 3361 SKIP_WS(scan); 3362 if (*scan == '+') { 3363 scan++; // skip '+' 3364 continue; 3365 } 3366 if (*scan == '-') { 3367 sign *= -1; 3368 scan++; // skip '-' 3369 continue; 3370 } 3371 break; 3372 } 3373 SKIP_WS(scan); 3374 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3375 next = scan; 3376 SKIP_DIGITS(next); 3377 stride = __kmp_str_to_int(scan, *next); 3378 KMP_DEBUG_ASSERT(stride >= 0); 3379 scan = next; 3380 stride *= sign; 3381 } 3382 3383 // Add places determined by initial_place : count : stride 3384 for (i = 0; i < count; i++) { 3385 if (setSize == 0) { 3386 break; 3387 } 3388 // Add the current place, then build the next place (tempMask) from that 3389 
KMP_CPU_COPY(previousMask, tempMask); 3390 ADD_MASK(previousMask); 3391 KMP_CPU_ZERO(tempMask); 3392 setSize = 0; 3393 KMP_CPU_SET_ITERATE(j, previousMask) { 3394 if (!KMP_CPU_ISSET(j, previousMask)) { 3395 continue; 3396 } 3397 if ((j + stride > maxOsId) || (j + stride < 0) || 3398 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3399 (!KMP_CPU_ISSET(j + stride, 3400 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3401 if ((__kmp_affinity_verbose || 3402 (__kmp_affinity_warnings && 3403 (__kmp_affinity_type != affinity_none))) && 3404 i < count - 1) { 3405 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3406 } 3407 continue; 3408 } 3409 KMP_CPU_SET(j + stride, tempMask); 3410 setSize++; 3411 } 3412 } 3413 KMP_CPU_ZERO(tempMask); 3414 setSize = 0; 3415 3416 // valid follow sets are ',' and EOL 3417 SKIP_WS(scan); 3418 if (*scan == '\0') { 3419 break; 3420 } 3421 if (*scan == ',') { 3422 scan++; // skip ',' 3423 continue; 3424 } 3425 3426 KMP_ASSERT2(0, "bad explicit places list"); 3427 } 3428 3429 *out_numMasks = nextNewMask; 3430 if (nextNewMask == 0) { 3431 *out_masks = NULL; 3432 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3433 return; 3434 } 3435 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3436 KMP_CPU_FREE(tempMask); 3437 KMP_CPU_FREE(previousMask); 3438 for (i = 0; i < nextNewMask; i++) { 3439 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3440 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3441 KMP_CPU_COPY(dest, src); 3442 } 3443 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3444 } 3445 3446 #undef ADD_MASK 3447 #undef ADD_MASK_OSID 3448 3449 // This function figures out the deepest level at which there is at least one 3450 // cluster/core with more than one processing unit bound to it. 
3451 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3452 int core_level = 0; 3453 3454 for (int i = 0; i < nprocs; i++) { 3455 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3456 for (int j = bottom_level; j > 0; j--) { 3457 if (hw_thread.ids[j] > 0) { 3458 if (core_level < (j - 1)) { 3459 core_level = j - 1; 3460 } 3461 } 3462 } 3463 } 3464 return core_level; 3465 } 3466 3467 // This function counts number of clusters/cores at given level. 3468 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3469 int core_level) { 3470 return __kmp_topology->get_count(core_level); 3471 } 3472 // This function finds to which cluster/core given processing unit is bound. 3473 static int __kmp_affinity_find_core(int proc, int bottom_level, 3474 int core_level) { 3475 int core = 0; 3476 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3477 for (int i = 0; i <= proc; ++i) { 3478 if (i + 1 <= proc) { 3479 for (int j = 0; j <= core_level; ++j) { 3480 if (__kmp_topology->at(i + 1).sub_ids[j] != 3481 __kmp_topology->at(i).sub_ids[j]) { 3482 core++; 3483 break; 3484 } 3485 } 3486 } 3487 } 3488 return core; 3489 } 3490 3491 // This function finds maximal number of processing units bound to a 3492 // cluster/core at given level. 
static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level,
                                            int core_level) {
  // A "core" at or below the bottom level can hold only one unit.
  if (core_level >= bottom_level)
    return 1;
  int thread_level = __kmp_topology->get_level(KMP_HW_THREAD);
  return __kmp_topology->calculate_ratio(thread_level, core_level);
}

// procarr[core * maxprocpercore + slot] -> OS proc id; -1 marks empty slots.
// Filled by the balanced-affinity setup below; freed in
// __kmp_affinity_uninitialize.
static int *procarr = NULL;
// Topology depth saved for later use by balanced affinity.
static int __kmp_aff_depth = 0;

// Create a one element mask array (set of places) which only contains the
// initial process's affinity mask
static void __kmp_create_affinity_none_places() {
  KMP_ASSERT(__kmp_affin_fullMask != NULL);
  KMP_ASSERT(__kmp_affinity_type == affinity_none);
  __kmp_affinity_num_masks = 1;
  KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
  kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
  KMP_CPU_COPY(dest, __kmp_affin_fullMask);
}

// Build the machine topology (trying several discovery methods), then create
// the gtid-indexed table of affinity masks according to __kmp_affinity_type.
// Idempotent: returns immediately if the masks were already created.
static void __kmp_aux_affinity_initialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_ASSERT(__kmp_affin_fullMask != NULL);
    return;
  }

  // Create the "full" mask - this defines all of the processors that we
  // consider to be in the machine model. If respect is set, then it is the
  // initialization thread's affinity mask. Otherwise, it is all processors that
  // we know about on the machine.
  if (__kmp_affin_fullMask == NULL) {
    KMP_CPU_ALLOC(__kmp_affin_fullMask);
  }
  if (KMP_AFFINITY_CAPABLE()) {
    __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
    if (__kmp_affinity_respect_mask) {
      // Count the number of available processors.
      unsigned i;
      __kmp_avail_proc = 0;
      KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
        if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
          continue;
        }
        __kmp_avail_proc++;
      }
      if (__kmp_avail_proc > __kmp_xproc) {
        // More procs in the mask than the OS reported: the mask is not
        // trustworthy, so disable affinity entirely.
        if (__kmp_affinity_verbose ||
            (__kmp_affinity_warnings &&
             (__kmp_affinity_type != affinity_none))) {
          KMP_WARNING(ErrorInitializeAffinity);
        }
        __kmp_affinity_type = affinity_none;
        KMP_AFFINITY_DISABLE();
        return;
      }

      if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      }
    } else {
      if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  __kmp_affin_fullMask);
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
      __kmp_avail_proc = __kmp_xproc;
#if KMP_OS_WINDOWS
      // Set the process affinity mask since threads' affinity
      // masks must be subset of process mask in Windows* OS
      __kmp_affin_fullMask->set_process_affinity(true);
#endif
    }
  }

  kmp_i18n_id_t msg_id = kmp_i18n_null;

  // For backward compatibility, setting KMP_CPUINFO_FILE =>
  // KMP_TOPOLOGY_METHOD=cpuinfo
  if ((__kmp_cpuinfo_file != NULL) &&
      (__kmp_affinity_top_method == affinity_top_method_all)) {
    __kmp_affinity_top_method = affinity_top_method_cpuinfo;
  }

  bool success = false;
  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or the
    // verbose flag is set, and the nowarnings flag was not set.
#if KMP_USE_HWLOC
    if (!success &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (!__kmp_hwloc_error) {
        success = __kmp_affinity_create_hwloc_map(&msg_id);
        if (!success && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (!success) {
      success = __kmp_affinity_create_x2apicid_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
    if (!success) {
      success = __kmp_affinity_create_apicid_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX
    if (!success) {
      int line = 0;
      success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY
    if (!success && (__kmp_num_proc_groups > 1)) {
      success = __kmp_affinity_create_proc_group_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_GROUP_AFFINITY */

    // Flat map is the last-resort method and is expected to always succeed.
    if (!success) {
      success = __kmp_affinity_create_flat_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
      KMP_ASSERT(success);
    }
  }

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.
#if KMP_USE_HWLOC
  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
    success = __kmp_affinity_create_hwloc_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif // KMP_USE_HWLOC

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
           __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
    success = __kmp_affinity_create_x2apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    success = __kmp_affinity_create_apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    int line = 0;
    success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      const char *filename = __kmp_cpuinfo_get_filename();
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
  }

#if KMP_GROUP_AFFINITY
  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    success = __kmp_affinity_create_proc_group_map(&msg_id);
    // NOTE(review): this assert fires before the fatal-error branch below can
    // run when asserts are enabled, making that branch a release-mode
    // fallback — confirm that is the intent.
    KMP_ASSERT(success);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_GROUP_AFFINITY */

  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
    success = __kmp_affinity_create_flat_map(&msg_id);
    // should not fail
    KMP_ASSERT(success);
  }

  // Early exit if topology could not be created
  if (!__kmp_topology) {
    if (KMP_AFFINITY_CAPABLE() &&
        (__kmp_affinity_verbose ||
         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
      KMP_WARNING(ErrorInitializeAffinity);
    }
    // If the static counters were filled in by a partially-successful
    // discovery attempt, still build a canonical topology from them.
    if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
        __kmp_ncores > 0) {
      __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
      __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
                                   __kmp_nThreadsPerCore, __kmp_ncores);
      if (__kmp_affinity_verbose) {
        __kmp_topology->print("KMP_AFFINITY");
      }
    }
    __kmp_affinity_type = affinity_none;
    __kmp_create_affinity_none_places();
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    KMP_AFFINITY_DISABLE();
    return;
  }

  // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
  // initialize other data structures which depend on the topology
  __kmp_topology->canonicalize();
  if (__kmp_affinity_verbose)
    __kmp_topology->print("KMP_AFFINITY");
  bool filtered = __kmp_topology->filter_hw_subset();
  if (filtered && __kmp_affinity_verbose)
    __kmp_topology->print("KMP_HW_SUBSET");
  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
  // If KMP_AFFINITY=none, then only create the single "none" place
  // which is the process's initial affinity mask or the number of
  // hardware threads depending on respect,norespect
  if (__kmp_affinity_type == affinity_none) {
    __kmp_create_affinity_none_places();
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    return;
  }
  int depth = __kmp_topology->get_depth();

  // Create the table of masks, indexed by thread Id.
  unsigned maxIndex;
  unsigned numUnique;
  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
  if (__kmp_affinity_gran_levels == 0) {
    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
  }

  switch (__kmp_affinity_type) {

  case affinity_explicit:
    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
      __kmp_affinity_process_proclist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    } else {
      __kmp_affinity_process_placelist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    }
    if (__kmp_affinity_num_masks == 0) {
      if (__kmp_affinity_verbose ||
          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
        KMP_WARNING(AffNoValidProcID);
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    }
    break;

  // The other affinity types rely on sorting the hardware threads according to
  // some permutation of the machine topology tree. Set __kmp_affinity_compact
  // and __kmp_affinity_offset appropriately, then jump to a common code
  // fragment to do the sort and create the array of affinity masks.
  case affinity_logical:
    __kmp_affinity_compact = 0;
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_physical:
    if (__kmp_nThreadsPerCore > 1) {
      __kmp_affinity_compact = 1;
      if (__kmp_affinity_compact >= depth) {
        __kmp_affinity_compact = 0;
      }
    } else {
      __kmp_affinity_compact = 0;
    }
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_scatter:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = 0;
    } else {
      // scatter is the mirror image of compact.
      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
    }
    goto sortTopology;

  case affinity_compact:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }
    goto sortTopology;

  case affinity_balanced:
    if (depth <= 1) {
      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    } else if (!__kmp_topology->is_uniform()) {
      // Save the depth for further usage
      __kmp_aff_depth = depth;

      int core_level =
          __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
      int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
                                                 core_level);
      int maxprocpercore = __kmp_affinity_max_proc_per_core(
          __kmp_avail_proc, depth - 1, core_level);

      int nproc = ncores * maxprocpercore;
      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
        }
        __kmp_affinity_type = affinity_none;
        return;
      }

      // Build procarr: a core-major table of OS proc ids (-1 = empty slot)
      // consumed later by the balanced-affinity assignment code.
      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        procarr[i] = -1;
      }

      int lastcore = -1;
      int inlastcore = 0;
      for (int i = 0; i < __kmp_avail_proc; i++) {
        int proc = __kmp_topology->at(i).os_id;
        int core = __kmp_affinity_find_core(i, depth - 1, core_level);

        if (core == lastcore) {
          inlastcore++;
        } else {
          inlastcore = 0;
        }
        lastcore = core;

        procarr[core * maxprocpercore + inlastcore] = proc;
      }
    }
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }

  sortTopology:
    // Allocate the gtid->affinity mask table.
    if (__kmp_affinity_dups) {
      __kmp_affinity_num_masks = __kmp_avail_proc;
    } else {
      __kmp_affinity_num_masks = numUnique;
    }

    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
        (__kmp_affinity_num_places > 0) &&
        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
      __kmp_affinity_num_masks = __kmp_affinity_num_places;
    }

    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

    // Sort the topology table according to the current setting of
    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
    __kmp_topology->sort_compact();
    {
      int i;
      unsigned j;
      int num_hw_threads = __kmp_topology->get_num_hw_threads();
      for (i = 0, j = 0; i < num_hw_threads; i++) {
        // Without dups, only the group leaders contribute a mask.
        if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
          continue;
        }
        int osId = __kmp_topology->at(i).os_id;

        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    // Sort the topology back using ids
    __kmp_topology->sort_ids();
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
}

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none
  // in this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  // A machine that is not affinity capable must have been marked disabled.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  // Restore the "disabled" marker that was temporarily slammed to "none".
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

// Tear down all global affinity state: the per-place mask array, the full
// mask, the explicit proclist, the balanced-affinity proc array, the hwloc
// topology (when built with hwloc), the KMP_HW_SUBSET description, the
// machine topology, and finally the affinity API dispatch object.
// Scalar settings are reset to their defaults so affinity can be
// re-initialized later.
void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
  __kmp_affinity_num_places = 0;
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  if (__kmp_hw_subset) {
    kmp_hw_subset_t::deallocate(__kmp_hw_subset);
    __kmp_hw_subset = nullptr;
  }
  if (__kmp_topology) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
  }
  KMPAffinity::destroy_api();
}

// Bind thread gtid to its initial place and record the place bookkeeping
// (th_current_place / th_new_place / th_first_place / th_last_place) in its
// kmp_info_t. isa_root is presumably nonzero for an initial (root) thread —
// confirm at call sites.
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  // Lazily allocate the thread's mask on first use; otherwise reuse it.
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
  // then the full mask is the same as the mask of the initialization thread.
  kmp_affin_mask_t *mask;
  int i;

  if (KMP_AFFINITY_NON_PROC_BIND) {
    // KMP_AFFINITY-style (non-OMP_PROC_BIND) affinity selection.
    if ((__kmp_affinity_type == affinity_none) ||
        (__kmp_affinity_type == affinity_balanced) ||
        KMP_HIDDEN_HELPER_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
      // With multiple Windows processor groups a single full mask cannot be
      // applied; leave the thread unbound.
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = 0;
      mask = __kmp_affin_fullMask;
    } else {
      // Pick a place by (adjusted gtid + offset) modulo the number of masks.
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  } else {
    // OMP_PROC_BIND-style selection.
    if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      // int i = some hash function or just a counter that doesn't
      // always start at 0. Use adjusted gtid for now.
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }

  // Record the selected place (KMP_PLACE_ALL when bound to the full mask)
  // and set up the thread's place partition.
  th->th.th_current_place = i;
  if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  } else if (KMP_AFFINITY_NON_PROC_BIND) {
    // When using a Non-OMP_PROC_BIND affinity method,
    // set all threads' place-partition-var to the entire place list
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
                   gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
                   gtid, i));
  }

  KMP_CPU_COPY(th->th.th_affin_mask, mask);

  // Verbose report for regular (non hidden-helper) threads only; balanced
  // affinity and per-place bindings are reported elsewhere (at the barrier).
  if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
      /* to avoid duplicate printing (will be correctly printed on barrier) */
      && (__kmp_affinity_type == affinity_none ||
          (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }

#if KMP_DEBUG
  // Hidden helper thread affinity only printed for debug builds
  if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
               (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
  }
#endif
#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  // See CQ171393.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    // NOTE: on non-Windows builds only this unconditional call remains; the
    // dangling "else" above binds to it, so keep the #if structure intact.
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

// Move the calling thread to its previously chosen th_new_place: validate
// the place against the thread's place partition (which may wrap around),
// copy that place's mask into th_affin_mask, and apply it to the OS thread.
void __kmp_affinity_set_place(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    // Contiguous partition: first <= new <= last.
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    // Wrapped partition: new place lies outside (last, first).
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

// Implementation of the kmp_set_affinity() API: optionally validate the
// user-supplied mask (non-empty, a subset of the full mask, and on Windows a
// single processor group), apply it to the OS thread, cache it in
// th_affin_mask on success, invalidate place bookkeeping, and disable OMP
// 4.0 proc-bind for the current task. Returns the OS call's result, or -1
// when affinity is not capable.
int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf(
            "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
            gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      // Every proc in the user mask must also be in the full mask; count the
      // procs to reject an empty mask.
      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      // A mask spanning multiple Windows processor groups is invalid.
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  // The explicit mask supersedes any place binding.
  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}

// Implementation of the kmp_get_affinity() API. On non-Windows systems the
// mask is queried live from the OS; on Windows the cached th_affin_mask is
// returned instead. Returns the OS call's result (0 on Windows), or -1 when
// affinity is not capable.
int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
#if KMP_OS_WINDOWS || KMP_DEBUG
  kmp_info_t *th;
#endif
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
#if KMP_OS_WINDOWS || KMP_DEBUG
  th = __kmp_threads[gtid];
#else
  (void)gtid; // unused variable
#endif
  // NOTE: th is only declared when KMP_OS_WINDOWS || KMP_DEBUG; the
  // KMP_DEBUG_ASSERT below compiles away otherwise, so this is well-formed.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  th->th.th_affin_mask);
        __kmp_printf(
            "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
            buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_printf(
            "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
            buf);
      });
  return retval;

#else
  (void)retval;

  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}

// Implementation of kmp_get_affinity_max_proc(): the number of procs an
// affinity mask can represent (all processor groups on Windows, otherwise
// the machine's proc count). Returns 0 when affinity is not capable.
int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}

// Implementation of kmp_set_affinity_mask_proc(): set bit `proc` in the
// user mask. Returns 0 on success, -1 for an out-of-range proc or when
// affinity is not capable, -2 when proc is outside the full mask.
int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

// Implementation of kmp_unset_affinity_mask_proc(): clear bit `proc` in the
// user mask. Return codes mirror __kmp_aux_set_affinity_mask_proc.
int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

// Implementation of kmp_get_affinity_mask_proc(): test bit `proc` in the
// user mask. Returns nonzero if set, 0 if clear or proc is outside the full
// mask, -1 for an out-of-range proc or when affinity is not capable.
int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

// Dynamic affinity settings - Affinity balanced
// Bind thread `tid` of a team of `nthreads` so threads are spread evenly
// across cores, at the granularity chosen via KMP_AFFINITY (thread-level
// "fine grain" vs core/socket-level). Uniform topologies are handled
// arithmetically; non-uniform ones via procarr built during initialization.
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  // Do not perform balanced affinity for the hidden helper threads
  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
    return;

  // fine_gran = bind to a single HW thread; it stays true only when the
  // requested granularity cannot contain more than one HW thread.
  switch (__kmp_affinity_gran) {
  case KMP_HW_THREAD:
    break;
  case KMP_HW_CORE:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case KMP_HW_SOCKET:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_topology->is_uniform()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    // No HT: balance across packages instead of cores.
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to it - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }
    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      // Bind to exactly one HW thread.
      int osID =
          __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
      KMP_CPU_SET(osID, mask);
    } else {
      // Bind to all HW threads of the chosen core.
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    // Recompute core-level sizing from the depth saved at initialization
    // (__kmp_aff_depth); procarr was filled there as well.
    int core_level =
        __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        // One thread per HW thread: direct 1:1 mapping.
        int osID = __kmp_topology->at(tid).os_id;
        KMP_CPU_SET(osID, mask);
      } else {
        // Bind to every proc sharing this thread's core.
        int core =
            __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = __kmp_topology->at(i).os_id;
          if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
              core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {
      // At most one thread per core: give thread `tid` the tid-th non-empty
      // core in procarr order.
      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors;
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      // Histogram cores by their number of available procs.
      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      // Suffix sums: cores with at least i procs.
      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      // Greedy distribution: first pass gives every context at most one
      // thread (flag == 0); subsequent passes (flag == 1) stack additional
      // threads onto already-used contexts until all nthreads are placed.
      // The exact loop order determines the assignment — do not reorder.
      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip the core with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      // Find this thread's slot: the first index where the cumulative thread
      // count exceeds tid.
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is GetProcessAffinityMask() api
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-omp threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  // Rebind the calling thread to the full initial mask.
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED