/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? "sockets" : "socket");
  case KMP_HW_DIE:
    return ((plural) ? "dice" : "die");
  case KMP_HW_MODULE:
    return ((plural) ? "modules" : "module");
  case KMP_HW_TILE:
    return ((plural) ? "tiles" : "tile");
  case KMP_HW_NUMA:
    return ((plural) ? "numa_domains" : "numa_domain");
  case KMP_HW_L3:
    return ((plural) ? "l3_caches" : "l3_cache");
  case KMP_HW_L2:
    return ((plural) ? "l2_caches" : "l2_cache");
  case KMP_HW_L1:
    return ((plural) ? "l1_caches" : "l1_cache");
  case KMP_HW_LLC:
    return ((plural) ? "ll_caches" : "ll_cache");
  case KMP_HW_CORE:
    return ((plural) ? "cores" : "core");
  case KMP_HW_THREAD:
    return ((plural) ? "threads" : "thread");
  case KMP_HW_PROC_GROUP:
    return ((plural) ? "proc_groups" : "proc_group");
  }
  return ((plural) ? "unknowns" : "unknown");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  for (int level = 0; level < depth; ++level) {
    if (ahwthread->ids[level] < bhwthread->ids[level])
      return -1;
    else if (ahwthread->ids[level] > bhwthread->ids[level])
      return 1;
  }
  if (ahwthread->os_id < bhwthread->os_id)
    return -1;
  else if (ahwthread->os_id > bhwthread->os_id)
    return 1;
  return 0;
}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
#endif

void kmp_hw_thread_t::print() const {
  int depth = __kmp_topology->get_depth();
  printf("%4d ", os_id);
  for (int i = 0; i < depth; ++i) {
    printf("%4d ", ids[i]);
  }
  printf("\n");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}

void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}

// Gather the count of each topology layer and the ratio
void kmp_topology_t::_gather_enumeration_information() {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.ids[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.ids[layer];
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  flags.uniform = (num == count[depth - 1]);
}

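// Illustrative example of the two routines above (hypothetical machine, not
// taken from this source): on a uniform 2-socket x 2-core x 2-thread box,
// _gather_enumeration_information produces count = {2, 4, 8} (totals per
// level) and ratio = {2, 2, 2} (maximum children per parent), and
// _discover_uniformity sees 2 * 2 * 2 == 8 == count[2], so the topology is
// flagged uniform. If one core had its second thread offlined, count[2]
// would drop to 7 while the ratio product stayed 8, and the topology would
// be reported as non-uniform.
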
// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {
  int previous_id[KMP_HW_LAST];
  int sub_id[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    sub_id[i] = -1;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Setup the sub_id
    for (int j = 0; j < depth; ++j) {
      if (hw_thread.ids[j] != previous_id[j]) {
        sub_id[j]++;
        for (int k = j + 1; k < depth; ++k) {
          sub_id[k] = 0;
        }
        break;
      }
    }
    // Set previous_id
    for (int j = 0; j < depth; ++j) {
      previous_id[j] = hw_thread.ids[j];
    }
    // Set the sub_ids field
    for (int j = 0; j < depth; ++j) {
      hw_thread.sub_ids[j] = sub_id[j];
    }
  }
}

void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * ndepth * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + ndepth;
  retval->count = arr + 2 * ndepth;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

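// Sketch of the single-allocation layout produced by allocate() above
// (descriptive only; the offsets follow directly from the pointer arithmetic):
//
//   [ kmp_topology_t header ]
//   [ kmp_hw_thread_t hw_threads[nproc] ]
//   [ int types[ndepth] ][ int ratio[ndepth] ][ int count[ndepth] ]
//
// so deallocate() only needs the single __kmp_free() call on the base pointer.
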
void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}

bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}

void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}

void kmp_topology_t::canonicalize() {
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}

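// Illustrative example of the granularity computation above (hypothetical
// values, not from this source): on a socket/core/thread topology with
// KMP_AFFINITY granularity=core, the loop counts the levels strictly below
// the core level, so __kmp_affinity_gran_levels becomes 1 (only the thread
// level); granularity=thread would give 0 and granularity=socket would give 2.
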
// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Apply the KMP_HW_SUBSET environment variable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if layers are in order
    if (i + 1 < hw_subset_depth) {
      kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
      if (next_type == KMP_HW_UNKNOWN) {
        KMP_WARNING(
            AffHWSubsetNotExistGeneric,
            __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
        return false;
      }
      int next_topology_level = get_level(next_type);
      if (level > next_topology_level) {
        KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
                    __kmp_hw_get_catalog_string(next_type));
        return false;
      }
    }

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}

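// Worked example for filter_hw_subset() (hypothetical machine and setting,
// not from this source): with a 2-socket x 4-core x 2-thread topology and
// KMP_HW_SUBSET=1s,2c,1t, each specified layer keeps sub_ids in
// [offset, offset + num), i.e., socket sub_id 0, core sub_ids 0-1 within that
// socket, and thread sub_id 0 within each kept core, leaving 2 of the
// original 16 hardware threads; the filtered ones are cleared from
// __kmp_affin_fullMask and subtracted from __kmp_avail_proc.
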
bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
  if (hw_level >= depth)
    return true;
  bool retval = true;
  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
  for (int i = 0; i < (depth - hw_level); ++i) {
    if (t1.ids[i] != t2.ids[i])
      return false;
  }
  return retval;
}

////////////////////////////////////////////////////////////////////////////////

#if KMP_AFFINITY_SUPPORTED
class kmp_affinity_raii_t {
  kmp_affin_mask_t *mask;
  bool restored;

public:
  kmp_affinity_raii_t() : restored(false) {
    KMP_CPU_ALLOC(mask);
    KMP_ASSERT(mask != NULL);
    __kmp_get_system_affinity(mask, TRUE);
  }
  void restore() {
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE(mask);
    restored = true;
  }
  ~kmp_affinity_raii_t() {
    if (!restored) {
      __kmp_set_system_affinity(mask, TRUE);
      KMP_CPU_FREE(mask);
    }
  }
};

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

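// Illustrative output of __kmp_affinity_print_mask above (and of the
// kmp_str_buf_t variant that follows), for a hypothetical mask: bits
// {0,1,2,4,5,6,7,9} print as "0-2,4-7,9" (runs of three or more contiguous
// bits collapse to "a-b", runs of one or two bits are printed individually),
// and an all-clear mask prints as "{<empty>}".
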
// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

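// Illustrative note for the group-affinity branch above (assuming 64-bit
// Windows, where CHAR_BIT * sizeof(DWORD_PTR) == 64): active processor i of
// processor group g occupies bit g * 64 + i in the full mask, so a machine
// with two groups of 36 processors would set bits 0-35 and 64-99.
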
// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it. Each routine
// returns a boolean on whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // store sub_id + 1 so that 0 is distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}

static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) != NULL) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity_verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
  }

  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    return true;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
    avail_ct++;
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }
  return true;
}

#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
  }
  return true;
}
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

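// Worked examples for the two helpers above (values chosen for illustration):
// __kmp_extract_bits<8, 15>(0x12345678) isolates bits 15..8 and returns 0x56,
// and __kmp_cpuid_mask_width(6) returns 3, the smallest width w such that
// (1 << w) >= 6, which is how cpuid "maximum count" values are turned into
// APIC id field widths in the code below.
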
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

class kmp_cache_info_t {
public:
  struct info_t {
    unsigned level, mask;
  };
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth; }
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    switch (level) {
    case 1:
      return KMP_HW_L1;
    case 2:
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

private:
  static const int MAX_CACHE_LEVEL = 3;

  size_t depth;
  info_t table[MAX_CACHE_LEVEL];

  void get_leaf4_levels() {
    unsigned level = 0;
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      kmp_cpuid buf2;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      if (!cache_type)
        break;
      // Skip instruction caches
      if (cache_type == 2) {
        level++;
        continue;
      }
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      table[depth].mask = ((-1) << cache_mask_width);
      depth++;
      level++;
    }
  }
};

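// Illustrative reading of the leaf-4 data gathered above (hypothetical cpuid
// values, not from this source): if a data cache reports
// max_threads_sharing == 2, then cache_mask_width == 1 and the stored mask is
// ~0x1, i.e., every bit except the lowest APIC id bit; hardware threads whose
// APIC ids match under that mask can be presumed to share the cache.
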
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
  }

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return false;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will
    // be 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the
    // encoded value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block
    // of code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We
    // have seen quite a few machines where maxThreadsPerPkg is 2, yet the
    // machine does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do
    //   not support HT.
    // - The performance penalty for mistakenly identifying a machine as HT
    //   when it isn't (which results in blocktime being incorrectly set to 0)
    //   is greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    return true;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is
  //   not exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return false;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return false;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }

  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  previous_affinity.restore();

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

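  // Numeric example of the decomposition above (hypothetical cpuid values):
  // with maxThreadsPerPkg == 8 and maxCoresPerPkg == 4, widthCT == 3,
  // widthC == 2 and widthT == 1, so an apicId of 0x1D (binary 11101) yields
  // pkgId == 0b11 == 3, coreId == (0b1110 & 0b11) == 2, and threadId == 1.
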
  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return false;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return false;
    }
  }
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;
  __kmp_ncores = nCores;
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int idx = 0;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;
2 : 1); 1734 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1735 kmp_hw_t types[3]; 1736 if (pkgLevel >= 0) 1737 types[idx++] = KMP_HW_SOCKET; 1738 if (coreLevel >= 0) 1739 types[idx++] = KMP_HW_CORE; 1740 if (threadLevel >= 0) 1741 types[idx++] = KMP_HW_THREAD; 1742 1743 KMP_ASSERT(depth > 0); 1744 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 1745 1746 for (i = 0; i < nApics; ++i) { 1747 idx = 0; 1748 unsigned os = threadInfo[i].osId; 1749 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 1750 hw_thread.clear(); 1751 1752 if (pkgLevel >= 0) { 1753 hw_thread.ids[idx++] = threadInfo[i].pkgId; 1754 } 1755 if (coreLevel >= 0) { 1756 hw_thread.ids[idx++] = threadInfo[i].coreId; 1757 } 1758 if (threadLevel >= 0) { 1759 hw_thread.ids[idx++] = threadInfo[i].threadId; 1760 } 1761 hw_thread.os_id = os; 1762 } 1763 1764 __kmp_free(threadInfo); 1765 __kmp_topology->sort_ids(); 1766 if (!__kmp_topology->check_ids()) { 1767 kmp_topology_t::deallocate(__kmp_topology); 1768 __kmp_topology = nullptr; 1769 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1770 return false; 1771 } 1772 return true; 1773 } 1774 1775 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1776 // architectures support a newer interface for specifying the x2APIC Ids, 1777 // based on CPUID.B or CPUID.1F 1778 /* 1779 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 1780 Bits Bits Bits Bits 1781 31-16 15-8 7-4 4-0 1782 ---+-----------+--------------+-------------+-----------------+ 1783 EAX| reserved | reserved | reserved | Bits to Shift | 1784 ---+-----------|--------------+-------------+-----------------| 1785 EBX| reserved | Num logical processors at level (16 bits) | 1786 ---+-----------|--------------+-------------------------------| 1787 ECX| reserved | Level Type | Level Number (8 bits) | 1788 ---+-----------+--------------+-------------------------------| 1789 EDX| X2APIC ID (32 bits) | 1790 ---+----------------------------------------------------------+ 1791 */ 1792 1793 enum { 1794 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 1795 INTEL_LEVEL_TYPE_SMT = 1, 1796 INTEL_LEVEL_TYPE_CORE = 2, 1797 INTEL_LEVEL_TYPE_TILE = 3, 1798 INTEL_LEVEL_TYPE_MODULE = 4, 1799 INTEL_LEVEL_TYPE_DIE = 5, 1800 INTEL_LEVEL_TYPE_LAST = 6, 1801 }; 1802 1803 struct cpuid_level_info_t { 1804 unsigned level_type, mask, mask_width, nitems, cache_mask; 1805 }; 1806 1807 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 1808 switch (intel_type) { 1809 case INTEL_LEVEL_TYPE_INVALID: 1810 return KMP_HW_SOCKET; 1811 case INTEL_LEVEL_TYPE_SMT: 1812 return KMP_HW_THREAD; 1813 case INTEL_LEVEL_TYPE_CORE: 1814 return KMP_HW_CORE; 1815 case INTEL_LEVEL_TYPE_TILE: 1816 return KMP_HW_TILE; 1817 case INTEL_LEVEL_TYPE_MODULE: 1818 return KMP_HW_MODULE; 1819 case INTEL_LEVEL_TYPE_DIE: 1820 return KMP_HW_DIE; 1821 } 1822 return KMP_HW_UNKNOWN; 1823 } 1824 1825 // This function takes the topology leaf, a levels array to store the levels 1826 // detected and a bitmap of the known levels. 1827 // Returns the number of levels in the topology 1828 static unsigned 1829 __kmp_x2apicid_get_levels(int leaf, 1830 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 1831 kmp_uint64 known_levels) { 1832 unsigned level, levels_index; 1833 unsigned level_type, mask_width, nitems; 1834 kmp_cpuid buf; 1835 1836 // New algorithm has known topology layers act as highest unknown topology 1837 // layers when unknown topology layers exist. 
1838 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 1839 // are unknown topology layers, Then SMT will take the characteristics of 1840 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 1841 // This eliminates unknown portions of the topology while still keeping the 1842 // correct structure. 1843 level = levels_index = 0; 1844 do { 1845 __kmp_x86_cpuid(leaf, level, &buf); 1846 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 1847 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 1848 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 1849 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 1850 return 0; 1851 1852 if (known_levels & (1ull << level_type)) { 1853 // Add a new level to the topology 1854 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 1855 levels[levels_index].level_type = level_type; 1856 levels[levels_index].mask_width = mask_width; 1857 levels[levels_index].nitems = nitems; 1858 levels_index++; 1859 } else { 1860 // If it is an unknown level, then logically move the previous layer up 1861 if (levels_index > 0) { 1862 levels[levels_index - 1].mask_width = mask_width; 1863 levels[levels_index - 1].nitems = nitems; 1864 } 1865 } 1866 level++; 1867 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 1868 1869 // Set the masks to & with apicid 1870 for (unsigned i = 0; i < levels_index; ++i) { 1871 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 1872 levels[i].mask = ~((-1) << levels[i].mask_width); 1873 levels[i].cache_mask = (-1) << levels[i].mask_width; 1874 for (unsigned j = 0; j < i; ++j) 1875 levels[i].mask ^= levels[j].mask; 1876 } else { 1877 KMP_DEBUG_ASSERT(levels_index > 0); 1878 levels[i].mask = (-1) << levels[i - 1].mask_width; 1879 levels[i].cache_mask = 0; 1880 } 1881 } 1882 return levels_index; 1883 } 1884 1885 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 1886 1887 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 1888 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 1889 unsigned levels_index; 1890 kmp_cpuid buf; 1891 kmp_uint64 known_levels; 1892 int topology_leaf, highest_leaf, apic_id; 1893 int num_leaves; 1894 static int leaves[] = {0, 0}; 1895 1896 kmp_i18n_id_t leaf_message_id; 1897 1898 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 1899 1900 *msg_id = kmp_i18n_null; 1901 if (__kmp_affinity_verbose) { 1902 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 1903 } 1904 1905 // Figure out the known topology levels 1906 known_levels = 0ull; 1907 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 1908 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 1909 known_levels |= (1ull << i); 1910 } 1911 } 1912 1913 // Get the highest cpuid leaf supported 1914 __kmp_x86_cpuid(0, 0, &buf); 1915 highest_leaf = buf.eax; 1916 1917 // If a specific topology method was requested, only allow that specific leaf 1918 // otherwise, try both leaves 31 and 11 in that order 1919 num_leaves = 0; 1920 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 1921 num_leaves = 1; 1922 leaves[0] = 11; 1923 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1924 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 1925 num_leaves = 1; 1926 leaves[0] = 31; 1927 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 1928 } else { 1929 num_leaves = 2; 1930 leaves[0] = 31; 1931 leaves[1] = 11; 1932 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1933 } 1934 1935 // Check to see if cpuid leaf 31 or 11 is supported. 
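// Illustrative note (not part of the upstream code), with hypothetical leaf
// 0x1F data: if the enumeration reports an SMT level with mask_width 1 and a
// core level with mask_width 6 before the terminating package sub-leaf,
// __kmp_x2apicid_get_levels() produces
//   SMT  : mask = 0x01, cache_mask = ~0x01
//   CORE : mask = 0x3E (0x3F with the SMT bit xor'ed away)
//   PKG  : mask = ~0x3F (the remaining high bits)
// and each thread's per-level ids are later recovered by AND'ing its x2APIC id
// with these masks (and shifting by the previous level's mask_width).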
1936 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1937 topology_leaf = -1; 1938 for (int i = 0; i < num_leaves; ++i) { 1939 int leaf = leaves[i]; 1940 if (highest_leaf < leaf) 1941 continue; 1942 __kmp_x86_cpuid(leaf, 0, &buf); 1943 if (buf.ebx == 0) 1944 continue; 1945 topology_leaf = leaf; 1946 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 1947 if (levels_index == 0) 1948 continue; 1949 break; 1950 } 1951 if (topology_leaf == -1 || levels_index == 0) { 1952 *msg_id = leaf_message_id; 1953 return false; 1954 } 1955 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 1956 1957 // The algorithm used starts by setting the affinity to each available thread 1958 // and retrieving info from the cpuid instruction, so if we are not capable of 1959 // calling __kmp_get_system_affinity() and __kmp_get_system_affinity(), then 1960 // we need to do something else - use the defaults that we calculated from 1961 // issuing cpuid without binding to each proc. 1962 if (!KMP_AFFINITY_CAPABLE()) { 1963 // Hack to try and infer the machine topology using only the data 1964 // available from cpuid on the current thread, and __kmp_xproc. 1965 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1966 for (unsigned i = 0; i < levels_index; ++i) { 1967 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 1968 __kmp_nThreadsPerCore = levels[i].nitems; 1969 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 1970 nCoresPerPkg = levels[i].nitems; 1971 } 1972 } 1973 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1974 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1975 return true; 1976 } 1977 1978 // Allocate the data structure to be returned. 1979 int depth = levels_index; 1980 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 1981 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 1982 __kmp_topology = 1983 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 1984 1985 // Insert equivalent cache types if they exist 1986 kmp_cache_info_t cache_info; 1987 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 1988 const kmp_cache_info_t::info_t &info = cache_info[i]; 1989 unsigned cache_mask = info.mask; 1990 unsigned cache_level = info.level; 1991 for (unsigned j = 0; j < levels_index; ++j) { 1992 unsigned hw_cache_mask = levels[j].cache_mask; 1993 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 1994 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 1995 kmp_hw_t type = 1996 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 1997 __kmp_topology->set_equivalent_type(cache_type, type); 1998 } 1999 } 2000 } 2001 2002 // From here on, we can assume that it is safe to call 2003 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2004 // __kmp_affinity_type = affinity_none. 2005 2006 // Save the affinity mask for the current thread. 2007 kmp_affinity_raii_t previous_affinity; 2008 2009 // Run through each of the available contexts, binding the current thread 2010 // to it, and obtaining the pertinent information using the cpuid instr. 2011 unsigned int proc; 2012 int hw_thread_index = 0; 2013 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2014 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2015 unsigned my_levels_index; 2016 2017 // Skip this proc if it is not included in the machine model. 
2018 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2019 continue; 2020 } 2021 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2022 2023 __kmp_affinity_dispatch->bind_thread(proc); 2024 2025 // New algorithm 2026 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2027 apic_id = buf.edx; 2028 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2029 my_levels_index = 2030 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2031 if (my_levels_index == 0 || my_levels_index != levels_index) { 2032 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2033 return false; 2034 } 2035 hw_thread.clear(); 2036 hw_thread.os_id = proc; 2037 // Put in topology information 2038 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2039 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2040 if (j > 0) { 2041 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2042 } 2043 } 2044 hw_thread_index++; 2045 } 2046 KMP_ASSERT(hw_thread_index > 0); 2047 __kmp_topology->sort_ids(); 2048 if (!__kmp_topology->check_ids()) { 2049 kmp_topology_t::deallocate(__kmp_topology); 2050 __kmp_topology = nullptr; 2051 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2052 return false; 2053 } 2054 return true; 2055 } 2056 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2057 2058 #define osIdIndex 0 2059 #define threadIdIndex 1 2060 #define coreIdIndex 2 2061 #define pkgIdIndex 3 2062 #define nodeIdIndex 4 2063 2064 typedef unsigned *ProcCpuInfo; 2065 static unsigned maxIndex = pkgIdIndex; 2066 2067 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2068 const void *b) { 2069 unsigned i; 2070 const unsigned *aa = *(unsigned *const *)a; 2071 const unsigned *bb = *(unsigned *const *)b; 2072 for (i = maxIndex;; i--) { 2073 if (aa[i] < bb[i]) 2074 return -1; 2075 if (aa[i] > bb[i]) 2076 return 1; 2077 if (i == osIdIndex) 2078 break; 2079 } 2080 return 0; 2081 } 2082 2083 #if KMP_USE_HIER_SCHED 2084 // Set the array sizes for the hierarchy layers 2085 static void __kmp_dispatch_set_hierarchy_values() { 2086 // Set the maximum number of L1's to number of cores 2087 // Set the maximum number of L2's to either number of cores / 2 for 2088 // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing 2089 // Or the number of cores for Intel(R) Xeon(R) processors 2090 // Set the maximum number of NUMA nodes and L3's to number of packages 2091 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2092 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2093 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2094 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2095 KMP_MIC_SUPPORTED 2096 if (__kmp_mic_type >= mic3) 2097 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2098 else 2099 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2100 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2101 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2102 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2103 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2104 // Set the number of threads per unit 2105 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2106 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2107 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2108 __kmp_nThreadsPerCore; 2109 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2110 KMP_MIC_SUPPORTED 2111 if (__kmp_mic_type >= mic3)
2112 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2113 2 * __kmp_nThreadsPerCore; 2114 else 2115 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2116 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2117 __kmp_nThreadsPerCore; 2118 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2119 nCoresPerPkg * __kmp_nThreadsPerCore; 2120 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2121 nCoresPerPkg * __kmp_nThreadsPerCore; 2122 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2123 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2124 } 2125 2126 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2127 // i.e., this thread's L1 or this thread's L2, etc. 2128 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2129 int index = type + 1; 2130 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2131 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2132 if (type == kmp_hier_layer_e::LAYER_THREAD) 2133 return tid; 2134 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2135 return 0; 2136 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2137 if (tid >= num_hw_threads) 2138 tid = tid % num_hw_threads; 2139 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2140 } 2141 2142 // Return the number of t1's per t2 2143 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2144 int i1 = t1 + 1; 2145 int i2 = t2 + 1; 2146 KMP_DEBUG_ASSERT(i1 <= i2); 2147 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2148 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2149 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2150 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2151 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2152 } 2153 #endif // KMP_USE_HIER_SCHED 2154 2155 static inline const char *__kmp_cpuinfo_get_filename() { 2156 const char *filename; 2157 if (__kmp_cpuinfo_file != nullptr) 2158 filename = __kmp_cpuinfo_file; 2159 else 2160 filename = "/proc/cpuinfo"; 2161 return filename; 2162 } 2163 2164 static inline const char *__kmp_cpuinfo_get_envvar() { 2165 const char *envvar = nullptr; 2166 if (__kmp_cpuinfo_file != nullptr) 2167 envvar = "KMP_CPUINFO_FILE"; 2168 return envvar; 2169 } 2170 2171 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2172 // affinity map. 2173 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2174 kmp_i18n_id_t *const msg_id) { 2175 const char *filename = __kmp_cpuinfo_get_filename(); 2176 const char *envvar = __kmp_cpuinfo_get_envvar(); 2177 *msg_id = kmp_i18n_null; 2178 2179 if (__kmp_affinity_verbose) { 2180 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2181 } 2182 2183 kmp_safe_raii_file_t f(filename, "r", envvar); 2184 2185 // Scan of the file, and count the number of "processor" (osId) fields, 2186 // and find the highest value of <n> for a node_<n> field. 
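// Illustrative note (not part of the upstream code): a minimal sketch of the
// input this parser accepts, with made-up values. Each logical CPU contributes
// one block of "keyword : value" lines terminated by a blank line, e.g.
//
//   processor       : 0
//   physical id     : 0
//   core id         : 0
//   thread id       : 0        (optional; auto-assigned below when missing)
//   node_0 id       : 0        (optional NUMA annotation)
//
// Only the leading keyword of each line matters to this code; unrecognized
// lines are skipped, and a real /proc/cpuinfo carries many more fields.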
2187 char buf[256]; 2188 unsigned num_records = 0; 2189 while (!feof(f)) { 2190 buf[sizeof(buf) - 1] = 1; 2191 if (!fgets(buf, sizeof(buf), f)) { 2192 // Read errors presumably because of EOF 2193 break; 2194 } 2195 2196 char s1[] = "processor"; 2197 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2198 num_records++; 2199 continue; 2200 } 2201 2202 // FIXME - this will match "node_<n> <garbage>" 2203 unsigned level; 2204 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2205 // validate the input first: 2206 if (level > (unsigned)__kmp_xproc) { // level is too big 2207 level = __kmp_xproc; 2208 } 2209 if (nodeIdIndex + level >= maxIndex) { 2210 maxIndex = nodeIdIndex + level; 2211 } 2212 continue; 2213 } 2214 } 2215 2216 // Check for empty file / no valid processor records, or too many. The number 2217 // of records can't exceed the number of valid bits in the affinity mask. 2218 if (num_records == 0) { 2219 *msg_id = kmp_i18n_str_NoProcRecords; 2220 return false; 2221 } 2222 if (num_records > (unsigned)__kmp_xproc) { 2223 *msg_id = kmp_i18n_str_TooManyProcRecords; 2224 return false; 2225 } 2226 2227 // Set the file pointer back to the beginning, so that we can scan the file 2228 // again, this time performing a full parse of the data. Allocate a vector of 2229 // ProcCpuInfo objects, where we will place the data. Adding an extra element 2230 // at the end allows us to remove a lot of extra checks for termination 2231 // conditions. 2232 if (fseek(f, 0, SEEK_SET) != 0) { 2233 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2234 return false; 2235 } 2236 2237 // Allocate the array of records to store the proc info in. The dummy 2238 // element at the end makes the logic in filling them out easier to code. 2239 unsigned **threadInfo = 2240 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2241 unsigned i; 2242 for (i = 0; i <= num_records; i++) { 2243 threadInfo[i] = 2244 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2245 } 2246 2247 #define CLEANUP_THREAD_INFO \ 2248 for (i = 0; i <= num_records; i++) { \ 2249 __kmp_free(threadInfo[i]); \ 2250 } \ 2251 __kmp_free(threadInfo); 2252 2253 // A value of UINT_MAX means that we didn't find the field 2254 unsigned __index; 2255 2256 #define INIT_PROC_INFO(p) \ 2257 for (__index = 0; __index <= maxIndex; __index++) { \ 2258 (p)[__index] = UINT_MAX; \ 2259 } 2260 2261 for (i = 0; i <= num_records; i++) { 2262 INIT_PROC_INFO(threadInfo[i]); 2263 } 2264 2265 unsigned num_avail = 0; 2266 *line = 0; 2267 while (!feof(f)) { 2268 // Create an inner scoping level, so that all the goto targets at the end of 2269 // the loop appear in an outer scoping level. This avoids warnings about 2270 // jumping past an initialization to a target in the same block. 2271 { 2272 buf[sizeof(buf) - 1] = 1; 2273 bool long_line = false; 2274 if (!fgets(buf, sizeof(buf), f)) { 2275 // Read errors presumably because of EOF 2276 // If there is valid data in threadInfo[num_avail], then fake 2277 // a blank line to ensure that the last address gets parsed. 2278 bool valid = false; 2279 for (i = 0; i <= maxIndex; i++) { 2280 if (threadInfo[num_avail][i] != UINT_MAX) { 2281 valid = true; 2282 } 2283 } 2284 if (!valid) { 2285 break; 2286 } 2287 buf[0] = 0; 2288 } else if (!buf[sizeof(buf) - 1]) { 2289 // The line is longer than the buffer. Set a flag and don't 2290 // emit an error if we were going to ignore the line, anyway.
2291 long_line = true; 2292 2293 #define CHECK_LINE \ 2294 if (long_line) { \ 2295 CLEANUP_THREAD_INFO; \ 2296 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2297 return false; \ 2298 } 2299 } 2300 (*line)++; 2301 2302 char s1[] = "processor"; 2303 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2304 CHECK_LINE; 2305 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2306 unsigned val; 2307 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2308 goto no_val; 2309 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2310 #if KMP_ARCH_AARCH64 2311 // Handle the old AArch64 /proc/cpuinfo layout differently, 2312 // it contains all of the 'processor' entries listed in a 2313 // single 'Processor' section, therefore the normal looking 2314 // for duplicates in that section will always fail. 2315 num_avail++; 2316 #else 2317 goto dup_field; 2318 #endif 2319 threadInfo[num_avail][osIdIndex] = val; 2320 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2321 char path[256]; 2322 KMP_SNPRINTF( 2323 path, sizeof(path), 2324 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2325 threadInfo[num_avail][osIdIndex]); 2326 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2327 2328 KMP_SNPRINTF(path, sizeof(path), 2329 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2330 threadInfo[num_avail][osIdIndex]); 2331 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2332 continue; 2333 #else 2334 } 2335 char s2[] = "physical id"; 2336 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2337 CHECK_LINE; 2338 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2339 unsigned val; 2340 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2341 goto no_val; 2342 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2343 goto dup_field; 2344 threadInfo[num_avail][pkgIdIndex] = val; 2345 continue; 2346 } 2347 char s3[] = "core id"; 2348 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2349 CHECK_LINE; 2350 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2351 unsigned val; 2352 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2353 goto no_val; 2354 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2355 goto dup_field; 2356 threadInfo[num_avail][coreIdIndex] = val; 2357 continue; 2358 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2359 } 2360 char s4[] = "thread id"; 2361 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2362 CHECK_LINE; 2363 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2364 unsigned val; 2365 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2366 goto no_val; 2367 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2368 goto dup_field; 2369 threadInfo[num_avail][threadIdIndex] = val; 2370 continue; 2371 } 2372 unsigned level; 2373 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2374 CHECK_LINE; 2375 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2376 unsigned val; 2377 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2378 goto no_val; 2379 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2380 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2381 goto dup_field; 2382 threadInfo[num_avail][nodeIdIndex + level] = val; 2383 continue; 2384 } 2385 2386 // We didn't recognize the leading token on the line. There are lots of 2387 // leading tokens that we don't recognize - if the line isn't empty, go on 2388 // to the next line. 2389 if ((*buf != 0) && (*buf != '\n')) { 2390 // If the line is longer than the buffer, read characters 2391 // until we find a newline. 
2392 if (long_line) { 2393 int ch; 2394 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2395 ; 2396 } 2397 continue; 2398 } 2399 2400 // A newline has signalled the end of the processor record. 2401 // Check that there aren't too many procs specified. 2402 if ((int)num_avail == __kmp_xproc) { 2403 CLEANUP_THREAD_INFO; 2404 *msg_id = kmp_i18n_str_TooManyEntries; 2405 return false; 2406 } 2407 2408 // Check for missing fields. The osId field must be there, and we 2409 // currently require that the physical id field is specified, also. 2410 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2411 CLEANUP_THREAD_INFO; 2412 *msg_id = kmp_i18n_str_MissingProcField; 2413 return false; 2414 } 2415 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2416 CLEANUP_THREAD_INFO; 2417 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2418 return false; 2419 } 2420 2421 // Skip this proc if it is not included in the machine model. 2422 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2423 __kmp_affin_fullMask)) { 2424 INIT_PROC_INFO(threadInfo[num_avail]); 2425 continue; 2426 } 2427 2428 // We have a successful parse of this proc's info. 2429 // Increment the counter, and prepare for the next proc. 2430 num_avail++; 2431 KMP_ASSERT(num_avail <= num_records); 2432 INIT_PROC_INFO(threadInfo[num_avail]); 2433 } 2434 continue; 2435 2436 no_val: 2437 CLEANUP_THREAD_INFO; 2438 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2439 return false; 2440 2441 dup_field: 2442 CLEANUP_THREAD_INFO; 2443 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2444 return false; 2445 } 2446 *line = 0; 2447 2448 #if KMP_MIC && REDUCE_TEAM_SIZE 2449 unsigned teamSize = 0; 2450 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2451 2452 // check for num_records == __kmp_xproc ??? 2453 2454 // If it is configured to omit the package level when there is only a single 2455 // package, the logic at the end of this routine won't work if there is only a 2456 // single thread 2457 KMP_ASSERT(num_avail > 0); 2458 KMP_ASSERT(num_avail <= num_records); 2459 2460 // Sort the threadInfo table by physical Id. 2461 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2462 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2463 2464 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2465 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2466 // the chips on a system. Although coreId's are usually assigned 2467 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2468 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2469 // 2470 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2471 // total # packages) are at this point - we want to determine that now. We 2472 // only have an upper bound on the first two figures. 2473 unsigned *counts = 2474 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2475 unsigned *maxCt = 2476 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2477 unsigned *totals = 2478 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2479 unsigned *lastId = 2480 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2481 2482 bool assign_thread_ids = false; 2483 unsigned threadIdCt; 2484 unsigned index; 2485 2486 restart_radix_check: 2487 threadIdCt = 0; 2488 2489 // Initialize the counter arrays with data from threadInfo[0]. 
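// Illustrative note (not part of the upstream code): for each level index,
// counts[] is the number of distinct ids seen under the current parent,
// maxCt[] is the maximum of counts[] over all parents, and totals[] is the
// overall number of nodes at that level. With a hypothetical sorted table of
// 2 packages x 2 cores x 2 threads, the pass below ends with
//   totals[pkgIdIndex] = 2, totals[coreIdIndex] = 4, totals[threadIdIndex] = 8
//   maxCt[coreIdIndex] = 2, maxCt[threadIdIndex] = 2
// which is how nPackages, nCoresPerPkg and __kmp_nThreadsPerCore are derived
// further down.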
2490 if (assign_thread_ids) { 2491 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2492 threadInfo[0][threadIdIndex] = threadIdCt++; 2493 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2494 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2495 } 2496 } 2497 for (index = 0; index <= maxIndex; index++) { 2498 counts[index] = 1; 2499 maxCt[index] = 1; 2500 totals[index] = 1; 2501 lastId[index] = threadInfo[0][index]; 2502 ; 2503 } 2504 2505 // Run through the rest of the OS procs. 2506 for (i = 1; i < num_avail; i++) { 2507 // Find the most significant index whose id differs from the id for the 2508 // previous OS proc. 2509 for (index = maxIndex; index >= threadIdIndex; index--) { 2510 if (assign_thread_ids && (index == threadIdIndex)) { 2511 // Auto-assign the thread id field if it wasn't specified. 2512 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2513 threadInfo[i][threadIdIndex] = threadIdCt++; 2514 } 2515 // Apparently the thread id field was specified for some entries and not 2516 // others. Start the thread id counter off at the next higher thread id. 2517 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2518 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2519 } 2520 } 2521 if (threadInfo[i][index] != lastId[index]) { 2522 // Run through all indices which are less significant, and reset the 2523 // counts to 1. At all levels up to and including index, we need to 2524 // increment the totals and record the last id. 2525 unsigned index2; 2526 for (index2 = threadIdIndex; index2 < index; index2++) { 2527 totals[index2]++; 2528 if (counts[index2] > maxCt[index2]) { 2529 maxCt[index2] = counts[index2]; 2530 } 2531 counts[index2] = 1; 2532 lastId[index2] = threadInfo[i][index2]; 2533 } 2534 counts[index]++; 2535 totals[index]++; 2536 lastId[index] = threadInfo[i][index]; 2537 2538 if (assign_thread_ids && (index > threadIdIndex)) { 2539 2540 #if KMP_MIC && REDUCE_TEAM_SIZE 2541 // The default team size is the total #threads in the machine 2542 // minus 1 thread for every core that has 3 or more threads. 2543 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2544 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2545 2546 // Restart the thread counter, as we are on a new core. 2547 threadIdCt = 0; 2548 2549 // Auto-assign the thread id field if it wasn't specified. 2550 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2551 threadInfo[i][threadIdIndex] = threadIdCt++; 2552 } 2553 2554 // Apparently the thread id field was specified for some entries and 2555 // not others. Start the thread id counter off at the next higher 2556 // thread id. 2557 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2558 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2559 } 2560 } 2561 break; 2562 } 2563 } 2564 if (index < threadIdIndex) { 2565 // If thread ids were specified, it is an error if they are not unique. 2566 // Also, check that we haven't already restarted the loop (to be safe - 2567 // shouldn't need to). 2568 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2569 __kmp_free(lastId); 2570 __kmp_free(totals); 2571 __kmp_free(maxCt); 2572 __kmp_free(counts); 2573 CLEANUP_THREAD_INFO; 2574 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2575 return false; 2576 } 2577 2578 // If the thread ids were not specified and we see entries that 2579 // are duplicates, start the loop over and assign the thread ids manually.
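// Illustrative note (not part of the upstream code): cpuinfo files often carry
// no "thread id" lines at all, so every record still has UINT_MAX in
// threadIdIndex. On the first pass two SMT siblings of the same core then look
// like duplicates and control falls through to here; the second pass (with
// assign_thread_ids set) numbers the threads on each core 0, 1, ... via
// threadIdCt instead of failing.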
2580 assign_thread_ids = true; 2581 goto restart_radix_check; 2582 } 2583 } 2584 2585 #if KMP_MIC && REDUCE_TEAM_SIZE 2586 // The default team size is the total #threads in the machine 2587 // minus 1 thread for every core that has 3 or more threads. 2588 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2589 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2590 2591 for (index = threadIdIndex; index <= maxIndex; index++) { 2592 if (counts[index] > maxCt[index]) { 2593 maxCt[index] = counts[index]; 2594 } 2595 } 2596 2597 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2598 nCoresPerPkg = maxCt[coreIdIndex]; 2599 nPackages = totals[pkgIdIndex]; 2600 2601 // Check to see if the machine topology is uniform 2602 unsigned prod = totals[maxIndex]; 2603 for (index = threadIdIndex; index < maxIndex; index++) { 2604 prod *= maxCt[index]; 2605 } 2606 2607 // When affinity is off, this routine will still be called to set 2608 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2609 // Make sure all these vars are set correctly, and return now if affinity is 2610 // not enabled. 2611 __kmp_ncores = totals[coreIdIndex]; 2612 if (!KMP_AFFINITY_CAPABLE()) { 2613 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2614 return true; 2615 } 2616 2617 #if KMP_MIC && REDUCE_TEAM_SIZE 2618 // Set the default team size. 2619 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2620 __kmp_dflt_team_nth = teamSize; 2621 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2622 "__kmp_dflt_team_nth = %d\n", 2623 __kmp_dflt_team_nth)); 2624 } 2625 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2626 2627 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2628 2629 // Count the number of levels which have more nodes at that level than at the 2630 // parent's level (with there being an implicit root node of the top level). 2631 // This is equivalent to saying that there is at least one node at this level 2632 // which has a sibling. These levels are in the map, and the package level is 2633 // always in the map. 2634 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2635 for (index = threadIdIndex; index < maxIndex; index++) { 2636 KMP_ASSERT(totals[index] >= totals[index + 1]); 2637 inMap[index] = (totals[index] > totals[index + 1]); 2638 } 2639 inMap[maxIndex] = (totals[maxIndex] > 1); 2640 inMap[pkgIdIndex] = true; 2641 inMap[coreIdIndex] = true; 2642 inMap[threadIdIndex] = true; 2643 2644 int depth = 0; 2645 int idx = 0; 2646 kmp_hw_t types[KMP_HW_LAST]; 2647 int pkgLevel = -1; 2648 int coreLevel = -1; 2649 int threadLevel = -1; 2650 for (index = threadIdIndex; index <= maxIndex; index++) { 2651 if (inMap[index]) { 2652 depth++; 2653 } 2654 } 2655 if (inMap[pkgIdIndex]) { 2656 pkgLevel = idx; 2657 types[idx++] = KMP_HW_SOCKET; 2658 } 2659 if (inMap[coreIdIndex]) { 2660 coreLevel = idx; 2661 types[idx++] = KMP_HW_CORE; 2662 } 2663 if (inMap[threadIdIndex]) { 2664 threadLevel = idx; 2665 types[idx++] = KMP_HW_THREAD; 2666 } 2667 KMP_ASSERT(depth > 0); 2668 2669 // Construct the data structure that is to be returned. 
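// Illustrative note (not part of the upstream code): with a hypothetical
// cpuinfo table holding only processor / physical id / core id fields
// (maxIndex == pkgIdIndex) for 2 packages x 8 cores x 2 threads, totals[] is
// 32 / 16 / 2 at the thread / core / package indices, every level therefore
// has at least one sibling, and the allocation below gets depth 3 with types
// SOCKET, CORE, THREAD.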
2670 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2671 2672 for (i = 0; i < num_avail; ++i) { 2673 unsigned os = threadInfo[i][osIdIndex]; 2674 int src_index; 2675 int dst_index = 0; 2676 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2677 hw_thread.clear(); 2678 hw_thread.os_id = os; 2679 2680 idx = 0; 2681 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2682 if (!inMap[src_index]) { 2683 continue; 2684 } 2685 if (src_index == pkgIdIndex) { 2686 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2687 } else if (src_index == coreIdIndex) { 2688 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2689 } else if (src_index == threadIdIndex) { 2690 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2691 } 2692 dst_index++; 2693 } 2694 } 2695 2696 __kmp_free(inMap); 2697 __kmp_free(lastId); 2698 __kmp_free(totals); 2699 __kmp_free(maxCt); 2700 __kmp_free(counts); 2701 CLEANUP_THREAD_INFO; 2702 __kmp_topology->sort_ids(); 2703 if (!__kmp_topology->check_ids()) { 2704 kmp_topology_t::deallocate(__kmp_topology); 2705 __kmp_topology = nullptr; 2706 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2707 return false; 2708 } 2709 return true; 2710 } 2711 2712 // Create and return a table of affinity masks, indexed by OS thread ID. 2713 // This routine handles OR'ing together all the affinity masks of threads 2714 // that are sufficiently close, if granularity > fine. 2715 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2716 unsigned *numUnique) { 2717 // First form a table of affinity masks in order of OS thread id. 2718 int maxOsId; 2719 int i; 2720 int numAddrs = __kmp_topology->get_num_hw_threads(); 2721 int depth = __kmp_topology->get_depth(); 2722 KMP_ASSERT(numAddrs); 2723 KMP_ASSERT(depth); 2724 2725 maxOsId = 0; 2726 for (i = numAddrs - 1;; --i) { 2727 int osId = __kmp_topology->at(i).os_id; 2728 if (osId > maxOsId) { 2729 maxOsId = osId; 2730 } 2731 if (i == 0) 2732 break; 2733 } 2734 kmp_affin_mask_t *osId2Mask; 2735 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2736 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2737 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2738 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2739 } 2740 if (__kmp_affinity_gran_levels >= (int)depth) { 2741 if (__kmp_affinity_verbose || 2742 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2743 KMP_WARNING(AffThreadsMayMigrate); 2744 } 2745 } 2746 2747 // Run through the table, forming the masks for all threads on each core. 2748 // Threads on the same core will have identical kmp_hw_thread_t objects, not 2749 // considering the last level, which must be the thread id. All threads on a 2750 // core will appear consecutively. 2751 int unique = 0; 2752 int j = 0; // index of 1st thread on core 2753 int leader = 0; 2754 kmp_affin_mask_t *sum; 2755 KMP_CPU_ALLOC_ON_STACK(sum); 2756 KMP_CPU_ZERO(sum); 2757 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 2758 for (i = 1; i < numAddrs; i++) { 2759 // If this thread is sufficiently close to the leader (within the 2760 // granularity setting), then set the bit for this os thread in the 2761 // affinity mask for this group, and go on to the next thread. 2762 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 2763 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2764 continue; 2765 } 2766 2767 // For every thread in this group, copy the mask to the thread's entry in 2768 // the osId2Mask table. Mark the first address as a leader. 
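// Illustrative note (not part of the upstream code): with KMP_AFFINITY
// granularity=core on a machine with 2 hardware threads per core and a
// SOCKET/CORE/THREAD topology, __kmp_affinity_gran_levels works out to 1, OS
// procs that share a core are "close", and the loop below gives both of them
// the same two-bit mask in osId2Mask (with the first one marked as the
// leader). With granularity=fine every OS proc keeps its own single-bit mask.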
2769 for (; j < i; j++) { 2770 int osId = __kmp_topology->at(j).os_id; 2771 KMP_DEBUG_ASSERT(osId <= maxOsId); 2772 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2773 KMP_CPU_COPY(mask, sum); 2774 __kmp_topology->at(j).leader = (j == leader); 2775 } 2776 unique++; 2777 2778 // Start a new mask. 2779 leader = i; 2780 KMP_CPU_ZERO(sum); 2781 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2782 } 2783 2784 // For every thread in last group, copy the mask to the thread's 2785 // entry in the osId2Mask table. 2786 for (; j < i; j++) { 2787 int osId = __kmp_topology->at(j).os_id; 2788 KMP_DEBUG_ASSERT(osId <= maxOsId); 2789 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2790 KMP_CPU_COPY(mask, sum); 2791 __kmp_topology->at(j).leader = (j == leader); 2792 } 2793 unique++; 2794 KMP_CPU_FREE_FROM_STACK(sum); 2795 2796 *maxIndex = maxOsId; 2797 *numUnique = unique; 2798 return osId2Mask; 2799 } 2800 2801 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2802 // as file-static than to try and pass them through the calling sequence of 2803 // the recursive-descent OMP_PLACES parser. 2804 static kmp_affin_mask_t *newMasks; 2805 static int numNewMasks; 2806 static int nextNewMask; 2807 2808 #define ADD_MASK(_mask) \ 2809 { \ 2810 if (nextNewMask >= numNewMasks) { \ 2811 int i; \ 2812 numNewMasks *= 2; \ 2813 kmp_affin_mask_t *temp; \ 2814 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2815 for (i = 0; i < numNewMasks / 2; i++) { \ 2816 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2817 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2818 KMP_CPU_COPY(dest, src); \ 2819 } \ 2820 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2821 newMasks = temp; \ 2822 } \ 2823 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2824 nextNewMask++; \ 2825 } 2826 2827 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2828 { \ 2829 if (((_osId) > _maxOsId) || \ 2830 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2831 if (__kmp_affinity_verbose || \ 2832 (__kmp_affinity_warnings && \ 2833 (__kmp_affinity_type != affinity_none))) { \ 2834 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2835 } \ 2836 } else { \ 2837 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2838 } \ 2839 } 2840 2841 // Re-parse the proclist (for the explicit affinity type), and form the list 2842 // of affinity newMasks indexed by gtid. 2843 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2844 unsigned int *out_numMasks, 2845 const char *proclist, 2846 kmp_affin_mask_t *osId2Mask, 2847 int maxOsId) { 2848 int i; 2849 const char *scan = proclist; 2850 const char *next = proclist; 2851 2852 // We use malloc() for the temporary mask vector, so that we can use 2853 // realloc() to extend it. 2854 numNewMasks = 2; 2855 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2856 nextNewMask = 0; 2857 kmp_affin_mask_t *sumMask; 2858 KMP_CPU_ALLOC(sumMask); 2859 int setSize = 0; 2860 2861 for (;;) { 2862 int start, end, stride; 2863 2864 SKIP_WS(scan); 2865 next = scan; 2866 if (*next == '\0') { 2867 break; 2868 } 2869 2870 if (*next == '{') { 2871 int num; 2872 setSize = 0; 2873 next++; // skip '{' 2874 SKIP_WS(next); 2875 scan = next; 2876 2877 // Read the first integer in the set. 
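// Illustrative note (not part of the upstream code): a brace set such as
// "{0,2,4}" in KMP_AFFINITY=...,explicit,proclist=[{0,2,4},7] is handled right
// here: the OS-proc masks of 0, 2 and 4 are OR'ed into sumMask and added as a
// single place, whereas a bare number like "7" becomes a place of its own.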
2878 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2879 SKIP_DIGITS(next); 2880 num = __kmp_str_to_int(scan, *next); 2881 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2882 2883 // Copy the mask for that osId to the sum (union) mask. 2884 if ((num > maxOsId) || 2885 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2886 if (__kmp_affinity_verbose || 2887 (__kmp_affinity_warnings && 2888 (__kmp_affinity_type != affinity_none))) { 2889 KMP_WARNING(AffIgnoreInvalidProcID, num); 2890 } 2891 KMP_CPU_ZERO(sumMask); 2892 } else { 2893 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2894 setSize = 1; 2895 } 2896 2897 for (;;) { 2898 // Check for end of set. 2899 SKIP_WS(next); 2900 if (*next == '}') { 2901 next++; // skip '}' 2902 break; 2903 } 2904 2905 // Skip optional comma. 2906 if (*next == ',') { 2907 next++; 2908 } 2909 SKIP_WS(next); 2910 2911 // Read the next integer in the set. 2912 scan = next; 2913 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2914 2915 SKIP_DIGITS(next); 2916 num = __kmp_str_to_int(scan, *next); 2917 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2918 2919 // Add the mask for that osId to the sum mask. 2920 if ((num > maxOsId) || 2921 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2922 if (__kmp_affinity_verbose || 2923 (__kmp_affinity_warnings && 2924 (__kmp_affinity_type != affinity_none))) { 2925 KMP_WARNING(AffIgnoreInvalidProcID, num); 2926 } 2927 } else { 2928 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2929 setSize++; 2930 } 2931 } 2932 if (setSize > 0) { 2933 ADD_MASK(sumMask); 2934 } 2935 2936 SKIP_WS(next); 2937 if (*next == ',') { 2938 next++; 2939 } 2940 scan = next; 2941 continue; 2942 } 2943 2944 // Read the first integer. 2945 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2946 SKIP_DIGITS(next); 2947 start = __kmp_str_to_int(scan, *next); 2948 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2949 SKIP_WS(next); 2950 2951 // If this isn't a range, then add a mask to the list and go on. 2952 if (*next != '-') { 2953 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2954 2955 // Skip optional comma. 2956 if (*next == ',') { 2957 next++; 2958 } 2959 scan = next; 2960 continue; 2961 } 2962 2963 // This is a range. Skip over the '-' and read in the 2nd int. 2964 next++; // skip '-' 2965 SKIP_WS(next); 2966 scan = next; 2967 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2968 SKIP_DIGITS(next); 2969 end = __kmp_str_to_int(scan, *next); 2970 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2971 2972 // Check for a stride parameter 2973 stride = 1; 2974 SKIP_WS(next); 2975 if (*next == ':') { 2976 // A stride is specified. Skip over the ':" and read the 3rd int. 2977 int sign = +1; 2978 next++; // skip ':' 2979 SKIP_WS(next); 2980 scan = next; 2981 if (*next == '-') { 2982 sign = -1; 2983 next++; 2984 SKIP_WS(next); 2985 scan = next; 2986 } 2987 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2988 SKIP_DIGITS(next); 2989 stride = __kmp_str_to_int(scan, *next); 2990 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2991 stride *= sign; 2992 } 2993 2994 // Do some range checks. 
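// Illustrative note (not part of the upstream code): the checks below accept
// forward and backward ranges alike. A term such as "2-8:2" yields start = 2,
// end = 8, stride = 2 and adds places for OS procs 2, 4, 6 and 8; "8-2:-2"
// walks the same procs in reverse; a zero stride, or a range whose direction
// contradicts the stride's sign, trips the assertions.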
2995 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2996 if (stride > 0) { 2997 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2998 } else { 2999 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3000 } 3001 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3002 3003 // Add the mask for each OS proc # to the list. 3004 if (stride > 0) { 3005 do { 3006 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3007 start += stride; 3008 } while (start <= end); 3009 } else { 3010 do { 3011 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3012 start += stride; 3013 } while (start >= end); 3014 } 3015 3016 // Skip optional comma. 3017 SKIP_WS(next); 3018 if (*next == ',') { 3019 next++; 3020 } 3021 scan = next; 3022 } 3023 3024 *out_numMasks = nextNewMask; 3025 if (nextNewMask == 0) { 3026 *out_masks = NULL; 3027 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3028 return; 3029 } 3030 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3031 for (i = 0; i < nextNewMask; i++) { 3032 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3033 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3034 KMP_CPU_COPY(dest, src); 3035 } 3036 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3037 KMP_CPU_FREE(sumMask); 3038 } 3039 3040 /*----------------------------------------------------------------------------- 3041 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3042 places. Again, Here is the grammar: 3043 3044 place_list := place 3045 place_list := place , place_list 3046 place := num 3047 place := place : num 3048 place := place : num : signed 3049 place := { subplacelist } 3050 place := ! place // (lowest priority) 3051 subplace_list := subplace 3052 subplace_list := subplace , subplace_list 3053 subplace := num 3054 subplace := num : num 3055 subplace := num : num : signed 3056 signed := num 3057 signed := + signed 3058 signed := - signed 3059 -----------------------------------------------------------------------------*/ 3060 static void __kmp_process_subplace_list(const char **scan, 3061 kmp_affin_mask_t *osId2Mask, 3062 int maxOsId, kmp_affin_mask_t *tempMask, 3063 int *setSize) { 3064 const char *next; 3065 3066 for (;;) { 3067 int start, count, stride, i; 3068 3069 // Read in the starting proc id 3070 SKIP_WS(*scan); 3071 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3072 next = *scan; 3073 SKIP_DIGITS(next); 3074 start = __kmp_str_to_int(*scan, *next); 3075 KMP_ASSERT(start >= 0); 3076 *scan = next; 3077 3078 // valid follow sets are ',' ':' and '}' 3079 SKIP_WS(*scan); 3080 if (**scan == '}' || **scan == ',') { 3081 if ((start > maxOsId) || 3082 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3083 if (__kmp_affinity_verbose || 3084 (__kmp_affinity_warnings && 3085 (__kmp_affinity_type != affinity_none))) { 3086 KMP_WARNING(AffIgnoreInvalidProcID, start); 3087 } 3088 } else { 3089 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3090 (*setSize)++; 3091 } 3092 if (**scan == '}') { 3093 break; 3094 } 3095 (*scan)++; // skip ',' 3096 continue; 3097 } 3098 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3099 (*scan)++; // skip ':' 3100 3101 // Read count parameter 3102 SKIP_WS(*scan); 3103 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3104 next = *scan; 3105 SKIP_DIGITS(next); 3106 count = __kmp_str_to_int(*scan, *next); 3107 KMP_ASSERT(count >= 0); 3108 *scan = next; 3109 3110 // valid follow sets are ',' ':' and '}' 3111 SKIP_WS(*scan); 3112 if (**scan == 
'}' || **scan == ',') { 3113 for (i = 0; i < count; i++) { 3114 if ((start > maxOsId) || 3115 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3116 if (__kmp_affinity_verbose || 3117 (__kmp_affinity_warnings && 3118 (__kmp_affinity_type != affinity_none))) { 3119 KMP_WARNING(AffIgnoreInvalidProcID, start); 3120 } 3121 break; // don't proliferate warnings for large count 3122 } else { 3123 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3124 start++; 3125 (*setSize)++; 3126 } 3127 } 3128 if (**scan == '}') { 3129 break; 3130 } 3131 (*scan)++; // skip ',' 3132 continue; 3133 } 3134 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3135 (*scan)++; // skip ':' 3136 3137 // Read stride parameter 3138 int sign = +1; 3139 for (;;) { 3140 SKIP_WS(*scan); 3141 if (**scan == '+') { 3142 (*scan)++; // skip '+' 3143 continue; 3144 } 3145 if (**scan == '-') { 3146 sign *= -1; 3147 (*scan)++; // skip '-' 3148 continue; 3149 } 3150 break; 3151 } 3152 SKIP_WS(*scan); 3153 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3154 next = *scan; 3155 SKIP_DIGITS(next); 3156 stride = __kmp_str_to_int(*scan, *next); 3157 KMP_ASSERT(stride >= 0); 3158 *scan = next; 3159 stride *= sign; 3160 3161 // valid follow sets are ',' and '}' 3162 SKIP_WS(*scan); 3163 if (**scan == '}' || **scan == ',') { 3164 for (i = 0; i < count; i++) { 3165 if ((start > maxOsId) || 3166 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3167 if (__kmp_affinity_verbose || 3168 (__kmp_affinity_warnings && 3169 (__kmp_affinity_type != affinity_none))) { 3170 KMP_WARNING(AffIgnoreInvalidProcID, start); 3171 } 3172 break; // don't proliferate warnings for large count 3173 } else { 3174 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3175 start += stride; 3176 (*setSize)++; 3177 } 3178 } 3179 if (**scan == '}') { 3180 break; 3181 } 3182 (*scan)++; // skip ',' 3183 continue; 3184 } 3185 3186 KMP_ASSERT2(0, "bad explicit places list"); 3187 } 3188 } 3189 3190 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3191 int maxOsId, kmp_affin_mask_t *tempMask, 3192 int *setSize) { 3193 const char *next; 3194 3195 // valid follow sets are '{' '!' and num 3196 SKIP_WS(*scan); 3197 if (**scan == '{') { 3198 (*scan)++; // skip '{' 3199 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3200 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3201 (*scan)++; // skip '}' 3202 } else if (**scan == '!') { 3203 (*scan)++; // skip '!' 
3204 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3205 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3206 } else if ((**scan >= '0') && (**scan <= '9')) { 3207 next = *scan; 3208 SKIP_DIGITS(next); 3209 int num = __kmp_str_to_int(*scan, *next); 3210 KMP_ASSERT(num >= 0); 3211 if ((num > maxOsId) || 3212 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3213 if (__kmp_affinity_verbose || 3214 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3215 KMP_WARNING(AffIgnoreInvalidProcID, num); 3216 } 3217 } else { 3218 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3219 (*setSize)++; 3220 } 3221 *scan = next; // skip num 3222 } else { 3223 KMP_ASSERT2(0, "bad explicit places list"); 3224 } 3225 } 3226 3227 // static void 3228 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3229 unsigned int *out_numMasks, 3230 const char *placelist, 3231 kmp_affin_mask_t *osId2Mask, 3232 int maxOsId) { 3233 int i, j, count, stride, sign; 3234 const char *scan = placelist; 3235 const char *next = placelist; 3236 3237 numNewMasks = 2; 3238 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3239 nextNewMask = 0; 3240 3241 // tempMask is modified based on the previous or initial 3242 // place to form the current place 3243 // previousMask contains the previous place 3244 kmp_affin_mask_t *tempMask; 3245 kmp_affin_mask_t *previousMask; 3246 KMP_CPU_ALLOC(tempMask); 3247 KMP_CPU_ZERO(tempMask); 3248 KMP_CPU_ALLOC(previousMask); 3249 KMP_CPU_ZERO(previousMask); 3250 int setSize = 0; 3251 3252 for (;;) { 3253 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3254 3255 // valid follow sets are ',' ':' and EOL 3256 SKIP_WS(scan); 3257 if (*scan == '\0' || *scan == ',') { 3258 if (setSize > 0) { 3259 ADD_MASK(tempMask); 3260 } 3261 KMP_CPU_ZERO(tempMask); 3262 setSize = 0; 3263 if (*scan == '\0') { 3264 break; 3265 } 3266 scan++; // skip ',' 3267 continue; 3268 } 3269 3270 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3271 scan++; // skip ':' 3272 3273 // Read count parameter 3274 SKIP_WS(scan); 3275 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3276 next = scan; 3277 SKIP_DIGITS(next); 3278 count = __kmp_str_to_int(scan, *next); 3279 KMP_ASSERT(count >= 0); 3280 scan = next; 3281 3282 // valid follow sets are ',' ':' and EOL 3283 SKIP_WS(scan); 3284 if (*scan == '\0' || *scan == ',') { 3285 stride = +1; 3286 } else { 3287 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3288 scan++; // skip ':' 3289 3290 // Read stride parameter 3291 sign = +1; 3292 for (;;) { 3293 SKIP_WS(scan); 3294 if (*scan == '+') { 3295 scan++; // skip '+' 3296 continue; 3297 } 3298 if (*scan == '-') { 3299 sign *= -1; 3300 scan++; // skip '-' 3301 continue; 3302 } 3303 break; 3304 } 3305 SKIP_WS(scan); 3306 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3307 next = scan; 3308 SKIP_DIGITS(next); 3309 stride = __kmp_str_to_int(scan, *next); 3310 KMP_DEBUG_ASSERT(stride >= 0); 3311 scan = next; 3312 stride *= sign; 3313 } 3314 3315 // Add places determined by initial_place : count : stride 3316 for (i = 0; i < count; i++) { 3317 if (setSize == 0) { 3318 break; 3319 } 3320 // Add the current place, then build the next place (tempMask) from that 3321 KMP_CPU_COPY(previousMask, tempMask); 3322 ADD_MASK(previousMask); 3323 KMP_CPU_ZERO(tempMask); 3324 setSize = 0; 3325 KMP_CPU_SET_ITERATE(j, previousMask) { 3326 if (!KMP_CPU_ISSET(j, previousMask)) { 3327 continue; 3328 } 3329 if ((j + stride > 
maxOsId) || (j + stride < 0) || 3330 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3331 (!KMP_CPU_ISSET(j + stride, 3332 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3333 if ((__kmp_affinity_verbose || 3334 (__kmp_affinity_warnings && 3335 (__kmp_affinity_type != affinity_none))) && 3336 i < count - 1) { 3337 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3338 } 3339 continue; 3340 } 3341 KMP_CPU_SET(j + stride, tempMask); 3342 setSize++; 3343 } 3344 } 3345 KMP_CPU_ZERO(tempMask); 3346 setSize = 0; 3347 3348 // valid follow sets are ',' and EOL 3349 SKIP_WS(scan); 3350 if (*scan == '\0') { 3351 break; 3352 } 3353 if (*scan == ',') { 3354 scan++; // skip ',' 3355 continue; 3356 } 3357 3358 KMP_ASSERT2(0, "bad explicit places list"); 3359 } 3360 3361 *out_numMasks = nextNewMask; 3362 if (nextNewMask == 0) { 3363 *out_masks = NULL; 3364 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3365 return; 3366 } 3367 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3368 KMP_CPU_FREE(tempMask); 3369 KMP_CPU_FREE(previousMask); 3370 for (i = 0; i < nextNewMask; i++) { 3371 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3372 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3373 KMP_CPU_COPY(dest, src); 3374 } 3375 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3376 } 3377 3378 #undef ADD_MASK 3379 #undef ADD_MASK_OSID 3380 3381 // This function figures out the deepest level at which there is at least one 3382 // cluster/core with more than one processing unit bound to it. 3383 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3384 int core_level = 0; 3385 3386 for (int i = 0; i < nprocs; i++) { 3387 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3388 for (int j = bottom_level; j > 0; j--) { 3389 if (hw_thread.ids[j] > 0) { 3390 if (core_level < (j - 1)) { 3391 core_level = j - 1; 3392 } 3393 } 3394 } 3395 } 3396 return core_level; 3397 } 3398 3399 // This function counts number of clusters/cores at given level. 3400 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3401 int core_level) { 3402 return __kmp_topology->get_count(core_level); 3403 } 3404 // This function finds to which cluster/core given processing unit is bound. 3405 static int __kmp_affinity_find_core(int proc, int bottom_level, 3406 int core_level) { 3407 int core = 0; 3408 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3409 for (int i = 0; i <= proc; ++i) { 3410 if (i + 1 <= proc) { 3411 for (int j = 0; j <= core_level; ++j) { 3412 if (__kmp_topology->at(i + 1).sub_ids[j] != 3413 __kmp_topology->at(i).sub_ids[j]) { 3414 core++; 3415 break; 3416 } 3417 } 3418 } 3419 } 3420 return core; 3421 } 3422 3423 // This function finds maximal number of processing units bound to a 3424 // cluster/core at given level. 
3425 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, 3426 int core_level) { 3427 if (core_level >= bottom_level) 3428 return 1; 3429 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); 3430 return __kmp_topology->calculate_ratio(thread_level, core_level); 3431 } 3432 3433 static int *procarr = NULL; 3434 static int __kmp_aff_depth = 0; 3435 3436 // Create a one element mask array (set of places) which only contains the 3437 // initial process's affinity mask 3438 static void __kmp_create_affinity_none_places() { 3439 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3440 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3441 __kmp_affinity_num_masks = 1; 3442 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3443 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 3444 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 3445 } 3446 3447 static void __kmp_aux_affinity_initialize(void) { 3448 if (__kmp_affinity_masks != NULL) { 3449 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3450 return; 3451 } 3452 3453 // Create the "full" mask - this defines all of the processors that we 3454 // consider to be in the machine model. If respect is set, then it is the 3455 // initialization thread's affinity mask. Otherwise, it is all processors that 3456 // we know about on the machine. 3457 if (__kmp_affin_fullMask == NULL) { 3458 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3459 } 3460 if (KMP_AFFINITY_CAPABLE()) { 3461 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3462 if (__kmp_affinity_respect_mask) { 3463 // Count the number of available processors. 3464 unsigned i; 3465 __kmp_avail_proc = 0; 3466 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3467 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3468 continue; 3469 } 3470 __kmp_avail_proc++; 3471 } 3472 if (__kmp_avail_proc > __kmp_xproc) { 3473 if (__kmp_affinity_verbose || 3474 (__kmp_affinity_warnings && 3475 (__kmp_affinity_type != affinity_none))) { 3476 KMP_WARNING(ErrorInitializeAffinity); 3477 } 3478 __kmp_affinity_type = affinity_none; 3479 KMP_AFFINITY_DISABLE(); 3480 return; 3481 } 3482 3483 if (__kmp_affinity_verbose) { 3484 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3485 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3486 __kmp_affin_fullMask); 3487 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 3488 } 3489 } else { 3490 if (__kmp_affinity_verbose) { 3491 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3492 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3493 __kmp_affin_fullMask); 3494 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 3495 } 3496 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3497 __kmp_avail_proc = __kmp_xproc; 3498 #if KMP_OS_WINDOWS 3499 // Set the process affinity mask since threads' affinity 3500 // masks must be subset of process mask in Windows* OS 3501 __kmp_affin_fullMask->set_process_affinity(true); 3502 #endif 3503 } 3504 } 3505 3506 kmp_i18n_id_t msg_id = kmp_i18n_null; 3507 3508 // For backward compatibility, setting KMP_CPUINFO_FILE => 3509 // KMP_TOPOLOGY_METHOD=cpuinfo 3510 if ((__kmp_cpuinfo_file != NULL) && 3511 (__kmp_affinity_top_method == affinity_top_method_all)) { 3512 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3513 } 3514 3515 bool success = false; 3516 if (__kmp_affinity_top_method == affinity_top_method_all) { 3517 // In the default code path, errors are not fatal - we just try using 3518 // another method. 
  bool success = false;
  if (__kmp_affinity_top_method == affinity_top_method_all) {
    // In the default code path, errors are not fatal - we just try using
    // another method. We only emit a warning message if affinity is on, or the
    // verbose flag is set, and the nowarnings flag was not set.
#if KMP_USE_HWLOC
    if (!success &&
        __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
      if (!__kmp_hwloc_error) {
        success = __kmp_affinity_create_hwloc_map(&msg_id);
        if (!success && __kmp_affinity_verbose) {
          KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
        }
      } else if (__kmp_affinity_verbose) {
        KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
      }
    }
#endif

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
    if (!success) {
      success = __kmp_affinity_create_x2apicid_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
    if (!success) {
      success = __kmp_affinity_create_apicid_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_LINUX
    if (!success) {
      int line = 0;
      success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_OS_LINUX */

#if KMP_GROUP_AFFINITY
    if (!success && (__kmp_num_proc_groups > 1)) {
      success = __kmp_affinity_create_proc_group_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
    }
#endif /* KMP_GROUP_AFFINITY */

    if (!success) {
      success = __kmp_affinity_create_flat_map(&msg_id);
      if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id));
      }
      KMP_ASSERT(success);
    }
  }

  // If the user has specified that a particular topology discovery method is
  // to be used, then we abort if that method fails. The exception is group
  // affinity, which might have been implicitly set.
#if KMP_USE_HWLOC
  else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
    KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
    success = __kmp_affinity_create_hwloc_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif // KMP_USE_HWLOC

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
           __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
    success = __kmp_affinity_create_x2apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
    success = __kmp_affinity_create_apicid_map(&msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

  else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
    int line = 0;
    success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      const char *filename = __kmp_cpuinfo_get_filename();
      if (line > 0) {
        KMP_FATAL(FileLineMsgExiting, filename, line,
                  __kmp_i18n_catgets(msg_id));
      } else {
        KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
      }
    }
  }

#if KMP_GROUP_AFFINITY
  else if (__kmp_affinity_top_method == affinity_top_method_group) {
    success = __kmp_affinity_create_proc_group_map(&msg_id);
    KMP_ASSERT(success);
    if (!success) {
      KMP_ASSERT(msg_id != kmp_i18n_null);
      KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
    }
  }
#endif /* KMP_GROUP_AFFINITY */

  else if (__kmp_affinity_top_method == affinity_top_method_flat) {
    success = __kmp_affinity_create_flat_map(&msg_id);
    // should not fail
    KMP_ASSERT(success);
  }

  // Early exit if topology could not be created
  if (!__kmp_topology) {
    if (KMP_AFFINITY_CAPABLE() &&
        (__kmp_affinity_verbose ||
         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
      KMP_WARNING(ErrorInitializeAffinity);
    }
    if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 &&
        __kmp_ncores > 0) {
      __kmp_topology = kmp_topology_t::allocate(0, 0, NULL);
      __kmp_topology->canonicalize(nPackages, nCoresPerPkg,
                                   __kmp_nThreadsPerCore, __kmp_ncores);
      if (__kmp_affinity_verbose) {
        __kmp_topology->print("KMP_AFFINITY");
      }
    }
    __kmp_affinity_type = affinity_none;
    __kmp_create_affinity_none_places();
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    KMP_AFFINITY_DISABLE();
    return;
  }

  // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and
  // initialize other data structures which depend on the topology
  __kmp_topology->canonicalize();
  if (__kmp_affinity_verbose)
    __kmp_topology->print("KMP_AFFINITY");
  bool filtered = __kmp_topology->filter_hw_subset();
  if (filtered && __kmp_affinity_verbose)
    __kmp_topology->print("KMP_HW_SUBSET");
  machine_hierarchy.init(__kmp_topology->get_num_hw_threads());
  KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads());
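
  // Illustrative example of the KMP_HW_SUBSET filtering above (hypothetical
  // machine, not guaranteed behavior): on a topology detected as
  // 2 sockets x 8 cores x 2 threads, setting KMP_HW_SUBSET=1s,4c,2t prunes
  // the topology to 1 socket, 4 cores per socket and 2 threads per core, so
  // only 8 hardware threads remain visible to the code below;
  // filter_hw_subset() then returns true and the reduced topology is printed
  // under "KMP_HW_SUBSET" when the verbose flag is set.
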
  // If KMP_AFFINITY=none, then only create the single "none" place, which is
  // either the process's initial affinity mask or the mask of all hardware
  // threads, depending on the respect/norespect setting.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_create_affinity_none_places();
#if KMP_USE_HIER_SCHED
    __kmp_dispatch_set_hierarchy_values();
#endif
    return;
  }
  int depth = __kmp_topology->get_depth();

  // Create the table of masks, indexed by thread Id.
  unsigned maxIndex;
  unsigned numUnique;
  kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique);
  if (__kmp_affinity_gran_levels == 0) {
    KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
  }

  switch (__kmp_affinity_type) {

  case affinity_explicit:
    KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
      __kmp_affinity_process_proclist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    } else {
      __kmp_affinity_process_placelist(
          &__kmp_affinity_masks, &__kmp_affinity_num_masks,
          __kmp_affinity_proclist, osId2Mask, maxIndex);
    }
    if (__kmp_affinity_num_masks == 0) {
      if (__kmp_affinity_verbose ||
          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
        KMP_WARNING(AffNoValidProcID);
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    }
    break;

  // The other affinity types rely on sorting the hardware threads according to
  // some permutation of the machine topology tree. Set __kmp_affinity_compact
  // and __kmp_affinity_offset appropriately, then jump to a common code
  // fragment to do the sort and create the array of affinity masks.
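  //
  // Illustrative example of what the compact/scatter permutation does
  // (hypothetical uniform machine of 2 sockets x 2 cores x 2 threads,
  // depth == 3, sub-id tuples written as (socket, core, thread)):
  //   - affinity_compact with __kmp_affinity_compact == 0 keeps the natural
  //     sort key (socket, core, thread), so consecutive places stay close:
  //     (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) ...
  //   - affinity_scatter turns a user value of 0 into depth - 1 - 0 == 2,
  //     which makes kmp_hw_thread_t::compare_compact() order the innermost
  //     levels first, i.e. the key becomes (thread, core, socket) and
  //     consecutive places alternate sockets:
  //     (0,0,0) (1,0,0) (0,1,0) (1,1,0) (0,0,1) ...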
  case affinity_logical:
    __kmp_affinity_compact = 0;
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_physical:
    if (__kmp_nThreadsPerCore > 1) {
      __kmp_affinity_compact = 1;
      if (__kmp_affinity_compact >= depth) {
        __kmp_affinity_compact = 0;
      }
    } else {
      __kmp_affinity_compact = 0;
    }
    if (__kmp_affinity_offset) {
      __kmp_affinity_offset =
          __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
    }
    goto sortTopology;

  case affinity_scatter:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = 0;
    } else {
      __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
    }
    goto sortTopology;

  case affinity_compact:
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }
    goto sortTopology;

  case affinity_balanced:
    if (depth <= 1) {
      if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
        KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
      }
      __kmp_affinity_type = affinity_none;
      __kmp_create_affinity_none_places();
      return;
    } else if (!__kmp_topology->is_uniform()) {
      // Save the depth for further usage
      __kmp_aff_depth = depth;

      int core_level =
          __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1);
      int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1,
                                                 core_level);
      int maxprocpercore = __kmp_affinity_max_proc_per_core(
          __kmp_avail_proc, depth - 1, core_level);

      int nproc = ncores * maxprocpercore;
      if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
        if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
          KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
        }
        __kmp_affinity_type = affinity_none;
        return;
      }

      procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        procarr[i] = -1;
      }

      int lastcore = -1;
      int inlastcore = 0;
      for (int i = 0; i < __kmp_avail_proc; i++) {
        int proc = __kmp_topology->at(i).os_id;
        int core = __kmp_affinity_find_core(i, depth - 1, core_level);

        if (core == lastcore) {
          inlastcore++;
        } else {
          inlastcore = 0;
        }
        lastcore = core;

        procarr[core * maxprocpercore + inlastcore] = proc;
      }
    }
    if (__kmp_affinity_compact >= depth) {
      __kmp_affinity_compact = depth - 1;
    }

  sortTopology:
    // Allocate the gtid->affinity mask table.
    if (__kmp_affinity_dups) {
      __kmp_affinity_num_masks = __kmp_avail_proc;
    } else {
      __kmp_affinity_num_masks = numUnique;
    }

    if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
        (__kmp_affinity_num_places > 0) &&
        ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
      __kmp_affinity_num_masks = __kmp_affinity_num_places;
    }

    KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

    // Sort the topology table according to the current setting of
    // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
    __kmp_topology->sort_compact();
    {
      int i;
      unsigned j;
      int num_hw_threads = __kmp_topology->get_num_hw_threads();
      for (i = 0, j = 0; i < num_hw_threads; i++) {
        if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) {
          continue;
        }
        int osId = __kmp_topology->at(i).os_id;

        kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
        kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
        KMP_ASSERT(KMP_CPU_ISSET(osId, src));
        KMP_CPU_COPY(dest, src);
        if (++j >= __kmp_affinity_num_masks) {
          break;
        }
      }
      KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
    }
    // Sort the topology back using ids
    __kmp_topology->sort_ids();
    break;

  default:
    KMP_ASSERT2(0, "Unexpected affinity setting");
  }

  KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
}

void __kmp_affinity_initialize(void) {
  // Much of the code above was written assuming that if a machine was not
  // affinity capable, then __kmp_affinity_type == affinity_none. We now
  // explicitly represent this as __kmp_affinity_type == affinity_disabled.
  // There are too many checks for __kmp_affinity_type == affinity_none
  // in this code. Instead of trying to change them all, check if
  // __kmp_affinity_type == affinity_disabled, and if so, slam it with
  // affinity_none, call the real initialization routine, then restore
  // __kmp_affinity_type to affinity_disabled.
  int disabled = (__kmp_affinity_type == affinity_disabled);
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(disabled);
  }
  if (disabled) {
    __kmp_affinity_type = affinity_none;
  }
  __kmp_aux_affinity_initialize();
  if (disabled) {
    __kmp_affinity_type = affinity_disabled;
  }
}

void __kmp_affinity_uninitialize(void) {
  if (__kmp_affinity_masks != NULL) {
    KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
    __kmp_affinity_masks = NULL;
  }
  if (__kmp_affin_fullMask != NULL) {
    KMP_CPU_FREE(__kmp_affin_fullMask);
    __kmp_affin_fullMask = NULL;
  }
  __kmp_affinity_num_masks = 0;
  __kmp_affinity_type = affinity_default;
  __kmp_affinity_num_places = 0;
  if (__kmp_affinity_proclist != NULL) {
    __kmp_free(__kmp_affinity_proclist);
    __kmp_affinity_proclist = NULL;
  }
  if (procarr != NULL) {
    __kmp_free(procarr);
    procarr = NULL;
  }
#if KMP_USE_HWLOC
  if (__kmp_hwloc_topology != NULL) {
    hwloc_topology_destroy(__kmp_hwloc_topology);
    __kmp_hwloc_topology = NULL;
  }
#endif
  if (__kmp_hw_subset) {
    kmp_hw_subset_t::deallocate(__kmp_hw_subset);
    __kmp_hw_subset = nullptr;
  }
  if (__kmp_topology) {
    kmp_topology_t::deallocate(__kmp_topology);
    __kmp_topology = nullptr;
  }
  KMPAffinity::destroy_api();
}
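
// Worked example for the place assignment done in
// __kmp_affinity_set_init_mask() below (the numbers are hypothetical): with
// __kmp_affinity_num_masks == 8 places and __kmp_affinity_offset == 2
// (e.g. from an offset given in KMP_AFFINITY=compact,0,2), a worker whose
// adjusted gtid is 5 gets
//   i = (5 + 2) % 8 == 7
// and is bound to place 7; the next worker (adjusted gtid 6) wraps around to
// place 0. Root and hidden-helper threads are handled separately and may be
// given the full mask instead.
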
void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
  if (th->th.th_affin_mask == NULL) {
    KMP_CPU_ALLOC(th->th.th_affin_mask);
  } else {
    KMP_CPU_ZERO(th->th.th_affin_mask);
  }

  // Copy the thread mask to the kmp_info_t structure. If
  // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
  // has all of the OS proc ids set, or, if __kmp_affinity_respect_mask is set,
  // the mask of the initialization thread.
  kmp_affin_mask_t *mask;
  int i;

  if (KMP_AFFINITY_NON_PROC_BIND) {
    if ((__kmp_affinity_type == affinity_none) ||
        (__kmp_affinity_type == affinity_balanced) ||
        KMP_HIDDEN_HELPER_THREAD(gtid)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = 0;
      mask = __kmp_affin_fullMask;
    } else {
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  } else {
    if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) ||
        (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
#if KMP_GROUP_AFFINITY
      if (__kmp_num_proc_groups > 1) {
        return;
      }
#endif
      KMP_ASSERT(__kmp_affin_fullMask != NULL);
      i = KMP_PLACE_ALL;
      mask = __kmp_affin_fullMask;
    } else {
      // int i = some hash function or just a counter that doesn't
      // always start at 0. Use adjusted gtid for now.
      int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid);
      KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
      i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks;
      mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
    }
  }

  th->th.th_current_place = i;
  if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) {
    th->th.th_new_place = i;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  } else if (KMP_AFFINITY_NON_PROC_BIND) {
    // When using a non-OMP_PROC_BIND affinity method,
    // set all threads' place-partition-var to the entire place list
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
  }

  if (i == KMP_PLACE_ALL) {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
                   gtid));
  } else {
    KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
                   gtid, i));
  }

  KMP_CPU_COPY(th->th.th_affin_mask, mask);

  if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid)
      /* to avoid duplicate printing (will be correctly printed on barrier) */
      && (__kmp_affinity_type == affinity_none ||
          (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }

#if KMP_DEBUG
  // Hidden helper thread affinity is only printed for debug builds
  if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)",
               (kmp_int32)getpid(), __kmp_gettid(), gtid, buf);
  }
#endif

#if KMP_OS_WINDOWS
  // On Windows* OS, the process affinity mask might have changed. If the user
  // didn't request affinity and this call fails, just continue silently.
  // See CQ171393.
  if (__kmp_affinity_type == affinity_none) {
    __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
  } else
#endif
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

void __kmp_affinity_set_place(int gtid) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return;
  }

  kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

  KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
                 "place = %d)\n",
                 gtid, th->th.th_new_place, th->th.th_current_place));

  // Check that the new place is within this thread's partition.
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  KMP_ASSERT(th->th.th_new_place >= 0);
  KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
  if (th->th.th_first_place <= th->th.th_last_place) {
    KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
               (th->th.th_new_place <= th->th.th_last_place));
  } else {
    KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
               (th->th.th_new_place >= th->th.th_last_place));
  }

  // Copy the thread mask to the kmp_info_t structure,
  // and set this thread's affinity.
  kmp_affin_mask_t *mask =
      KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
  KMP_CPU_COPY(th->th.th_affin_mask, mask);
  th->th.th_current_place = th->th.th_new_place;

  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              th->th.th_affin_mask);
    KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
               __kmp_gettid(), gtid, buf);
  }
  __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

int __kmp_aux_set_affinity(void **mask) {
  int gtid;
  kmp_info_t *th;
  int retval;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf(
            "kmp_set_affinity: setting affinity mask for thread %d = %s\n",
            gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
    } else {
      unsigned proc;
      int num_procs = 0;

      KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
        if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
          KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
          continue;
        }
        num_procs++;
      }
      if (num_procs == 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }

#if KMP_GROUP_AFFINITY
      if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
        KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
      }
#endif /* KMP_GROUP_AFFINITY */
    }
  }

  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
  retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  if (retval == 0) {
    KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
  }

  th->th.th_current_place = KMP_PLACE_UNDEFINED;
  th->th.th_new_place = KMP_PLACE_UNDEFINED;
  th->th.th_first_place = 0;
  th->th.th_last_place = __kmp_affinity_num_masks - 1;

  // Turn off 4.0 affinity for the current thread at this parallel level.
  th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;

  return retval;
}

int __kmp_aux_get_affinity(void **mask) {
  int gtid;
  int retval;
  kmp_info_t *th;

  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  gtid = __kmp_entry_gtid();
  th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  th->th.th_affin_mask);
        __kmp_printf(
            "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid,
            buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
    }
  }

#if !KMP_OS_WINDOWS

  retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
  KA_TRACE(
      1000, (""); {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_printf(
            "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid,
            buf);
      });
  return retval;

#else
  (void)retval;

  KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
  return 0;

#endif /* KMP_OS_WINDOWS */
}

int __kmp_aux_get_affinity_max_proc() {
  if (!KMP_AFFINITY_CAPABLE()) {
    return 0;
  }
#if KMP_GROUP_AFFINITY
  if (__kmp_num_proc_groups > 1) {
    return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
  }
#endif
  return __kmp_xproc;
}

int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}
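
// The __kmp_aux_* entry points in this block back the kmp_* affinity
// extensions this runtime exposes to user code (kmp_create_affinity_mask,
// kmp_set_affinity_mask_proc, kmp_set_affinity, ...). A minimal user-side
// sketch, for illustration only and with error handling mostly omitted:
//
//   #include <omp.h>
//
//   kmp_affinity_mask_t mask;
//   kmp_create_affinity_mask(&mask);
//   kmp_set_affinity_mask_proc(0, &mask); // request OS proc 0
//   if (kmp_set_affinity(&mask) != 0) {
//     // nonzero: the mask could not be applied to the calling thread
//   }
//   kmp_destroy_affinity_mask(&mask);
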
int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return -2;
  }

  KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
  return 0;
}

int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
  if (!KMP_AFFINITY_CAPABLE()) {
    return -1;
  }

  KA_TRACE(
      1000, (""); {
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                                  (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
                           "affinity mask for thread %d = %s\n",
                           proc, gtid, buf);
      });

  if (__kmp_env_consistency_check) {
    if ((mask == NULL) || (*mask == NULL)) {
      KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
    }
  }

  if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
    return -1;
  }
  if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
    return 0;
  }

  return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
  KMP_DEBUG_ASSERT(th);
  bool fine_gran = true;
  int tid = th->th.th_info.ds.ds_tid;

  // Do not perform balanced affinity for the hidden helper threads
  if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th)))
    return;

  switch (__kmp_affinity_gran) {
  case KMP_HW_THREAD:
    break;
  case KMP_HW_CORE:
    if (__kmp_nThreadsPerCore > 1) {
      fine_gran = false;
    }
    break;
  case KMP_HW_SOCKET:
    if (nCoresPerPkg > 1) {
      fine_gran = false;
    }
    break;
  default:
    fine_gran = false;
  }

  if (__kmp_topology->is_uniform()) {
    int coreID;
    int threadID;
    // Number of hyper threads per core in an HT machine
    int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
    // Number of cores
    int ncores = __kmp_ncores;
    if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
      __kmp_nth_per_core = __kmp_avail_proc / nPackages;
      ncores = nPackages;
    }
    // How many threads will be bound to each core
    int chunk = nthreads / ncores;
    // How many cores will have an additional thread bound to them - "big cores"
    int big_cores = nthreads % ncores;
    // Number of threads on the big cores
    int big_nth = (chunk + 1) * big_cores;
    if (tid < big_nth) {
      coreID = tid / (chunk + 1);
      threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
    } else { // tid >= big_nth
      coreID = (tid - big_cores) / chunk;
      threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
    }
    KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
                      "Illegal set affinity operation when not capable");

    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    if (fine_gran) {
      int osID =
          __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id;
      KMP_CPU_SET(osID, mask);
    } else {
      for (int i = 0; i < __kmp_nth_per_core; i++) {
        int osID;
        osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id;
        KMP_CPU_SET(osID, mask);
      }
    }
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  } else { // Non-uniform topology
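
    // Apart from the nthreads == __kmp_avail_proc fast path, this branch
    // distributes threads over procarr[], which __kmp_aux_affinity_initialize()
    // laid out as ncores x maxprocpercore slots with -1 marking missing
    // processing units (see the worked example above
    // __kmp_affinity_max_proc_per_core). Illustrative numbers: with
    // ncores == 6, nth_per_core == 2 and nthreads == 4, the
    // "nthreads <= ncores" path below walks the cores that have at least one
    // valid slot and binds thread tid to the tid-th such core - all of its
    // valid OS procs, or only the first one when fine_gran is true.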
    kmp_affin_mask_t *mask = th->th.th_affin_mask;
    KMP_CPU_ZERO(mask);

    int core_level =
        __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1);
    int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc,
                                               __kmp_aff_depth - 1, core_level);
    int nth_per_core = __kmp_affinity_max_proc_per_core(
        __kmp_avail_proc, __kmp_aff_depth - 1, core_level);

    // For performance gain consider the special case nthreads ==
    // __kmp_avail_proc
    if (nthreads == __kmp_avail_proc) {
      if (fine_gran) {
        int osID = __kmp_topology->at(tid).os_id;
        KMP_CPU_SET(osID, mask);
      } else {
        int core =
            __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level);
        for (int i = 0; i < __kmp_avail_proc; i++) {
          int osID = __kmp_topology->at(i).os_id;
          if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) ==
              core) {
            KMP_CPU_SET(osID, mask);
          }
        }
      }
    } else if (nthreads <= ncores) {

      int core = 0;
      for (int i = 0; i < ncores; i++) {
        // Check if this core from procarr[] is in the mask
        int in_mask = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            in_mask = 1;
            break;
          }
        }
        if (in_mask) {
          if (tid == core) {
            for (int j = 0; j < nth_per_core; j++) {
              int osID = procarr[i * nth_per_core + j];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
                // For fine granularity it is enough to set the first available
                // osID for this core
                if (fine_gran) {
                  break;
                }
              }
            }
            break;
          } else {
            core++;
          }
        }
      }
    } else { // nthreads > ncores
      // Array to save the number of processors at each core
      int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
      // Array to save the number of cores with "x" available processors
      int *ncores_with_x_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
      // Array to save the number of cores with # procs from x to nth_per_core
      int *ncores_with_x_to_max_procs =
          (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));

      for (int i = 0; i <= nth_per_core; i++) {
        ncores_with_x_procs[i] = 0;
        ncores_with_x_to_max_procs[i] = 0;
      }

      for (int i = 0; i < ncores; i++) {
        int cnt = 0;
        for (int j = 0; j < nth_per_core; j++) {
          if (procarr[i * nth_per_core + j] != -1) {
            cnt++;
          }
        }
        nproc_at_core[i] = cnt;
        ncores_with_x_procs[cnt]++;
      }

      for (int i = 0; i <= nth_per_core; i++) {
        for (int j = i; j <= nth_per_core; j++) {
          ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
        }
      }

      // Max number of processors
      int nproc = nth_per_core * ncores;
      // An array to keep the number of threads per each context
      int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
      for (int i = 0; i < nproc; i++) {
        newarr[i] = 0;
      }

      int nth = nthreads;
      int flag = 0;
      while (nth > 0) {
        for (int j = 1; j <= nth_per_core; j++) {
          int cnt = ncores_with_x_to_max_procs[j];
          for (int i = 0; i < ncores; i++) {
            // Skip cores with 0 processors
            if (nproc_at_core[i] == 0) {
              continue;
            }
            for (int k = 0; k < nth_per_core; k++) {
              if (procarr[i * nth_per_core + k] != -1) {
                if (newarr[i * nth_per_core + k] == 0) {
                  newarr[i * nth_per_core + k] = 1;
                  cnt--;
                  nth--;
                  break;
                } else {
                  if (flag != 0) {
                    newarr[i * nth_per_core + k]++;
                    cnt--;
                    nth--;
                    break;
                  }
                }
              }
            }
            if (cnt == 0 || nth == 0) {
              break;
            }
          }
          if (nth == 0) {
            break;
          }
        }
        flag = 1;
      }
      int sum = 0;
      for (int i = 0; i < nproc; i++) {
        sum += newarr[i];
        if (sum > tid) {
          if (fine_gran) {
            int osID = procarr[i];
            KMP_CPU_SET(osID, mask);
          } else {
            int coreID = i / nth_per_core;
            for (int ii = 0; ii < nth_per_core; ii++) {
              int osID = procarr[coreID * nth_per_core + ii];
              if (osID != -1) {
                KMP_CPU_SET(osID, mask);
              }
            }
          }
          break;
        }
      }
      __kmp_free(newarr);
    }

    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
      KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
                 __kmp_gettid(), tid, buf);
    }
    __kmp_set_system_affinity(mask, TRUE);
  }
}

#if KMP_OS_LINUX || KMP_OS_FREEBSD
// We don't need this entry for Windows because
// there is the GetProcessAffinityMask() API.
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
    int
    kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind the thread
//   >0 (errno) if an error happened during binding
{
  int gtid = __kmp_get_gtid();
  if (gtid < 0) {
    // Do not touch non-OpenMP threads
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "non-omp thread, returning\n"));
    return -1;
  }
  if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
    KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                  "affinity not initialized, returning\n"));
    return -1;
  }
  KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
                "set full mask for thread %d\n",
                gtid));
  KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
  return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED
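
// Illustrative usage of kmp_set_thread_affinity_mask_initial() following the
// steps described above it. This is a user-side sketch only (Linux; the
// sched_* calls and CPU_* macros come from <sched.h>, and
// run_non_openmp_parallel_work() is a hypothetical placeholder):
//
//   #include <sched.h> // compile with -D_GNU_SOURCE
//   extern "C" int kmp_set_thread_affinity_mask_initial();
//
//   cpu_set_t saved;
//   CPU_ZERO(&saved);
//   sched_getaffinity(0, sizeof(saved), &saved);       // 1) save current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0) { // 2) widen to full mask
//     run_non_openmp_parallel_work();                  // 3)+4) check, then use
//   }
//   sched_setaffinity(0, sizeof(saved), &saved);       // 5) restore saved mask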