/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif
#if KMP_USE_HWLOC
// Copied from hwloc
#define HWLOC_GROUP_KIND_INTEL_MODULE 102
#define HWLOC_GROUP_KIND_INTEL_TILE 103
#define HWLOC_GROUP_KIND_INTEL_DIE 104
#define HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP 220
#endif

// The machine topology
kmp_topology_t *__kmp_topology = nullptr;
// KMP_HW_SUBSET environment variable
kmp_hw_subset_t *__kmp_hw_subset = nullptr;

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  __kmp_type_convert(machine_hierarchy.numPerLevel[0] - 1,
                     &(thr_bar->base_leaf_kids));
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
  case KMP_HW_DIE:
    return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
  case KMP_HW_MODULE:
    return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
  case KMP_HW_TILE:
    return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
  case KMP_HW_NUMA:
    return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
  case KMP_HW_L3:
    return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
  case KMP_HW_L2:
    return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
  case KMP_HW_L1:
    return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
  case KMP_HW_LLC:
    return ((plural) ? KMP_I18N_STR(LLCaches) : KMP_I18N_STR(LLCache));
  case KMP_HW_CORE:
    return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
  case KMP_HW_THREAD:
    return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
  case KMP_HW_PROC_GROUP:
    return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
  }
  return KMP_I18N_STR(Unknown);
}

const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) {
  switch (type) {
  case KMP_HW_SOCKET:
    return ((plural) ? "sockets" : "socket");
  case KMP_HW_DIE:
    return ((plural) ? "dice" : "die");
  case KMP_HW_MODULE:
    return ((plural) ? "modules" : "module");
  case KMP_HW_TILE:
    return ((plural) ? "tiles" : "tile");
  case KMP_HW_NUMA:
    return ((plural) ? "numa_domains" : "numa_domain");
  case KMP_HW_L3:
    return ((plural) ? "l3_caches" : "l3_cache");
  case KMP_HW_L2:
    return ((plural) ? "l2_caches" : "l2_cache");
  case KMP_HW_L1:
    return ((plural) ? "l1_caches" : "l1_cache");
  case KMP_HW_LLC:
    return ((plural) ? "ll_caches" : "ll_cache");
  case KMP_HW_CORE:
    return ((plural) ? "cores" : "core");
  case KMP_HW_THREAD:
    return ((plural) ? "threads" : "thread");
  case KMP_HW_PROC_GROUP:
    return ((plural) ? "proc_groups" : "proc_group");
  }
  return ((plural) ? "unknowns" : "unknown");
}

////////////////////////////////////////////////////////////////////////////////
// kmp_hw_thread_t methods
int kmp_hw_thread_t::compare_ids(const void *a, const void *b) {
  const kmp_hw_thread_t *ahwthread = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bhwthread = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  for (int level = 0; level < depth; ++level) {
    if (ahwthread->ids[level] < bhwthread->ids[level])
      return -1;
    else if (ahwthread->ids[level] > bhwthread->ids[level])
      return 1;
  }
  if (ahwthread->os_id < bhwthread->os_id)
    return -1;
  else if (ahwthread->os_id > bhwthread->os_id)
    return 1;
  return 0;
}

#if KMP_AFFINITY_SUPPORTED
int kmp_hw_thread_t::compare_compact(const void *a, const void *b) {
  int i;
  const kmp_hw_thread_t *aa = (const kmp_hw_thread_t *)a;
  const kmp_hw_thread_t *bb = (const kmp_hw_thread_t *)b;
  int depth = __kmp_topology->get_depth();
  KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
  KMP_DEBUG_ASSERT(__kmp_affinity_compact <= depth);
  for (i = 0; i < __kmp_affinity_compact; i++) {
    int j = depth - i - 1;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  for (; i < depth; i++) {
    int j = i - __kmp_affinity_compact;
    if (aa->sub_ids[j] < bb->sub_ids[j])
      return -1;
    if (aa->sub_ids[j] > bb->sub_ids[j])
      return 1;
  }
  return 0;
}
#endif

void kmp_hw_thread_t::print() const {
  int depth = __kmp_topology->get_depth();
  printf("%4d ", os_id);
  for (int i = 0; i < depth; ++i) {
    printf("%4d ", ids[i]);
  }
  printf("\n");
}
////////////////////////////////////////////////////////////////////////////////
// kmp_topology_t methods

// Remove layers that don't add information to the topology.
// This is done by having the layer take on the id = UNKNOWN_ID (-1)
void kmp_topology_t::_remove_radix1_layers() {
  int preference[KMP_HW_LAST];
  int top_index1, top_index2;
  // Set up preference associative array
  preference[KMP_HW_PROC_GROUP] = 110;
  preference[KMP_HW_SOCKET] = 100;
  preference[KMP_HW_CORE] = 95;
  preference[KMP_HW_THREAD] = 90;
  preference[KMP_HW_NUMA] = 85;
  preference[KMP_HW_DIE] = 80;
  preference[KMP_HW_TILE] = 75;
  preference[KMP_HW_MODULE] = 73;
  preference[KMP_HW_L3] = 70;
  preference[KMP_HW_L2] = 65;
  preference[KMP_HW_L1] = 60;
  preference[KMP_HW_LLC] = 5;
  top_index1 = 0;
  top_index2 = 1;
  while (top_index1 < depth - 1 && top_index2 < depth) {
    kmp_hw_t type1 = types[top_index1];
    kmp_hw_t type2 = types[top_index2];
    KMP_ASSERT_VALID_HW_TYPE(type1);
    KMP_ASSERT_VALID_HW_TYPE(type2);
    // Do not allow the three main topology levels (sockets, cores, threads) to
    // be compacted down
    if ((type1 == KMP_HW_THREAD || type1 == KMP_HW_CORE ||
         type1 == KMP_HW_SOCKET) &&
        (type2 == KMP_HW_THREAD || type2 == KMP_HW_CORE ||
         type2 == KMP_HW_SOCKET)) {
      top_index1 = top_index2++;
      continue;
    }
    bool radix1 = true;
    bool all_same = true;
    int id1 = hw_threads[0].ids[top_index1];
    int id2 = hw_threads[0].ids[top_index2];
    int pref1 = preference[type1];
    int pref2 = preference[type2];
    for (int hwidx = 1; hwidx < num_hw_threads; ++hwidx) {
      if (hw_threads[hwidx].ids[top_index1] == id1 &&
          hw_threads[hwidx].ids[top_index2] != id2) {
        radix1 = false;
        break;
      }
      if (hw_threads[hwidx].ids[top_index2] != id2)
        all_same = false;
      id1 = hw_threads[hwidx].ids[top_index1];
      id2 = hw_threads[hwidx].ids[top_index2];
    }
    if (radix1) {
      // Select the layer to remove based on preference
      kmp_hw_t remove_type, keep_type;
      int remove_layer, remove_layer_ids;
      if (pref1 > pref2) {
        remove_type = type2;
        remove_layer = remove_layer_ids = top_index2;
        keep_type = type1;
      } else {
        remove_type = type1;
        remove_layer = remove_layer_ids = top_index1;
        keep_type = type2;
      }
      // If all the indexes for the second (deeper) layer are the same,
      // e.g., all are zero, then make sure to keep the first layer's ids
      if (all_same)
        remove_layer_ids = top_index2;
      // Remove radix one type by setting the equivalence, removing the id from
      // the hw threads and removing the layer from types and depth
      set_equivalent_type(remove_type, keep_type);
      for (int idx = 0; idx < num_hw_threads; ++idx) {
        kmp_hw_thread_t &hw_thread = hw_threads[idx];
        for (int d = remove_layer_ids; d < depth - 1; ++d)
          hw_thread.ids[d] = hw_thread.ids[d + 1];
      }
      for (int idx = remove_layer; idx < depth - 1; ++idx)
        types[idx] = types[idx + 1];
      depth--;
    } else {
      top_index1 = top_index2++;
    }
  }
  KMP_ASSERT(depth > 0);
}
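// Illustrative sketch (assumed numbers, not tied to any real machine): if the
// detected layers are SOCKET, L3, CORE, THREAD and every socket contains
// exactly one L3 (radix 1), the L3 layer adds no information. The loop above
// then records L3 as equivalent to SOCKET, shifts each hw_thread's ids[] down
// by one slot, and drops L3 from types[], leaving SOCKET, CORE, THREAD.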
void kmp_topology_t::_set_last_level_cache() {
  if (get_equivalent_type(KMP_HW_L3) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L3);
  else if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
#if KMP_MIC_SUPPORTED
  else if (__kmp_mic_type == mic3) {
    if (get_equivalent_type(KMP_HW_L2) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L2);
    else if (get_equivalent_type(KMP_HW_TILE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_TILE);
    // L2/Tile wasn't detected so just say L1
    else
      set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  }
#endif
  else if (get_equivalent_type(KMP_HW_L1) != KMP_HW_UNKNOWN)
    set_equivalent_type(KMP_HW_LLC, KMP_HW_L1);
  // Fallback is to set last level cache to socket or core
  if (get_equivalent_type(KMP_HW_LLC) == KMP_HW_UNKNOWN) {
    if (get_equivalent_type(KMP_HW_SOCKET) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_SOCKET);
    else if (get_equivalent_type(KMP_HW_CORE) != KMP_HW_UNKNOWN)
      set_equivalent_type(KMP_HW_LLC, KMP_HW_CORE);
  }
  KMP_ASSERT(get_equivalent_type(KMP_HW_LLC) != KMP_HW_UNKNOWN);
}

// Gather the count of each topology layer and the ratio
void kmp_topology_t::_gather_enumeration_information() {
  int previous_id[KMP_HW_LAST];
  int max[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = kmp_hw_thread_t::UNKNOWN_ID;
    max[i] = 0;
    count[i] = 0;
    ratio[i] = 0;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    for (int layer = 0; layer < depth; ++layer) {
      int id = hw_thread.ids[layer];
      if (id != previous_id[layer]) {
        // Add an additional increment to each count
        for (int l = layer; l < depth; ++l)
          count[l]++;
        // Keep track of topology layer ratio statistics
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l];
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer) {
      previous_id[layer] = hw_thread.ids[layer];
    }
  }
  for (int layer = 0; layer < depth; ++layer) {
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  }
}

// Find out if the topology is uniform
void kmp_topology_t::_discover_uniformity() {
  int num = 1;
  for (int level = 0; level < depth; ++level)
    num *= ratio[level];
  flags.uniform = (num == count[depth - 1]);
}
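// Illustrative sketch (assumed numbers): for a uniform machine with
// 2 sockets x 4 cores/socket x 2 threads/core and sorted ids, the loops above
// produce count = {2, 8, 16} (totals per layer) and ratio = {2, 4, 2}
// (maximum number of children per parent at each layer). Since
// 2 * 4 * 2 == 16 == count[depth - 1], _discover_uniformity() marks the
// topology uniform; a machine where one socket had fewer cores would fail
// that product test.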
// Set all the sub_ids for each hardware thread
void kmp_topology_t::_set_sub_ids() {
  int previous_id[KMP_HW_LAST];
  int sub_id[KMP_HW_LAST];

  for (int i = 0; i < depth; ++i) {
    previous_id[i] = -1;
    sub_id[i] = -1;
  }
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Setup the sub_id
    for (int j = 0; j < depth; ++j) {
      if (hw_thread.ids[j] != previous_id[j]) {
        sub_id[j]++;
        for (int k = j + 1; k < depth; ++k) {
          sub_id[k] = 0;
        }
        break;
      }
    }
    // Set previous_id
    for (int j = 0; j < depth; ++j) {
      previous_id[j] = hw_thread.ids[j];
    }
    // Set the sub_ids field
    for (int j = 0; j < depth; ++j) {
      hw_thread.sub_ids[j] = sub_id[j];
    }
  }
}

void kmp_topology_t::_set_globals() {
  // Set nCoresPerPkg, nPackages, __kmp_nThreadsPerCore, __kmp_ncores
  int core_level, thread_level, package_level;
  package_level = get_level(KMP_HW_SOCKET);
#if KMP_GROUP_AFFINITY
  if (package_level == -1)
    package_level = get_level(KMP_HW_PROC_GROUP);
#endif
  core_level = get_level(KMP_HW_CORE);
  thread_level = get_level(KMP_HW_THREAD);

  KMP_ASSERT(core_level != -1);
  KMP_ASSERT(thread_level != -1);

  __kmp_nThreadsPerCore = calculate_ratio(thread_level, core_level);
  if (package_level != -1) {
    nCoresPerPkg = calculate_ratio(core_level, package_level);
    nPackages = get_count(package_level);
  } else {
    // assume one socket
    nCoresPerPkg = get_count(core_level);
    nPackages = 1;
  }
#ifndef KMP_DFLT_NTH_CORES
  __kmp_ncores = get_count(core_level);
#endif
}

kmp_topology_t *kmp_topology_t::allocate(int nproc, int ndepth,
                                         const kmp_hw_t *types) {
  kmp_topology_t *retval;
  // Allocate all data in one large allocation
  size_t size = sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc +
                sizeof(int) * ndepth * 3;
  char *bytes = (char *)__kmp_allocate(size);
  retval = (kmp_topology_t *)bytes;
  if (nproc > 0) {
    retval->hw_threads = (kmp_hw_thread_t *)(bytes + sizeof(kmp_topology_t));
  } else {
    retval->hw_threads = nullptr;
  }
  retval->num_hw_threads = nproc;
  retval->depth = ndepth;
  int *arr =
      (int *)(bytes + sizeof(kmp_topology_t) + sizeof(kmp_hw_thread_t) * nproc);
  retval->types = (kmp_hw_t *)arr;
  retval->ratio = arr + ndepth;
  retval->count = arr + 2 * ndepth;
  KMP_FOREACH_HW_TYPE(type) { retval->equivalent[type] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < ndepth; ++i) {
    retval->types[i] = types[i];
    retval->equivalent[types[i]] = types[i];
  }
  return retval;
}

void kmp_topology_t::deallocate(kmp_topology_t *topology) {
  if (topology)
    __kmp_free(topology);
}
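// Illustrative layout sketch of the single allocation made above (field sizes
// are symbolic, not literal byte counts):
//
//   [ kmp_topology_t ][ kmp_hw_thread_t x nproc ][ types | ratio | count ]
//                                                 (3 arrays of ndepth ints)
//
// Because everything lives in one block, deallocate() only needs the single
// __kmp_free() call.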
bool kmp_topology_t::check_ids() const {
  // Assume ids have been sorted
  if (num_hw_threads == 0)
    return true;
  for (int i = 1; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &current_thread = hw_threads[i];
    kmp_hw_thread_t &previous_thread = hw_threads[i - 1];
    bool unique = false;
    for (int j = 0; j < depth; ++j) {
      if (previous_thread.ids[j] != current_thread.ids[j]) {
        unique = true;
        break;
      }
    }
    if (unique)
      continue;
    return false;
  }
  return true;
}

void kmp_topology_t::dump() const {
  printf("***********************\n");
  printf("*** __kmp_topology: ***\n");
  printf("***********************\n");
  printf("* depth: %d\n", depth);

  printf("* types: ");
  for (int i = 0; i < depth; ++i)
    printf("%15s ", __kmp_hw_get_keyword(types[i]));
  printf("\n");

  printf("* ratio: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", ratio[i]);
  }
  printf("\n");

  printf("* count: ");
  for (int i = 0; i < depth; ++i) {
    printf("%15d ", count[i]);
  }
  printf("\n");

  printf("* equivalent map:\n");
  KMP_FOREACH_HW_TYPE(i) {
    const char *key = __kmp_hw_get_keyword(i);
    const char *value = __kmp_hw_get_keyword(equivalent[i]);
    printf("%-15s -> %-15s\n", key, value);
  }

  printf("* uniform: %s\n", (is_uniform() ? "Yes" : "No"));

  printf("* num_hw_threads: %d\n", num_hw_threads);
  printf("* hw_threads:\n");
  for (int i = 0; i < num_hw_threads; ++i) {
    hw_threads[i].print();
  }
  printf("***********************\n");
}

void kmp_topology_t::print(const char *env_var) const {
  kmp_str_buf_t buf;
  int print_types_depth;
  __kmp_str_buf_init(&buf);
  kmp_hw_t print_types[KMP_HW_LAST + 2];

  // Num Available Threads
  KMP_INFORM(AvailableOSProc, env_var, num_hw_threads);

  // Uniform or not
  if (is_uniform()) {
    KMP_INFORM(Uniform, env_var);
  } else {
    KMP_INFORM(NonUniform, env_var);
  }

  // Equivalent types
  KMP_FOREACH_HW_TYPE(type) {
    kmp_hw_t eq_type = equivalent[type];
    if (eq_type != KMP_HW_UNKNOWN && eq_type != type) {
      KMP_INFORM(AffEqualTopologyTypes, env_var,
                 __kmp_hw_get_catalog_string(type),
                 __kmp_hw_get_catalog_string(eq_type));
    }
  }

  // Quick topology
  KMP_ASSERT(depth > 0 && depth <= (int)KMP_HW_LAST);
  // Create a print types array that always guarantees printing
  // the core and thread level
  print_types_depth = 0;
  for (int level = 0; level < depth; ++level)
    print_types[print_types_depth++] = types[level];
  if (equivalent[KMP_HW_CORE] != KMP_HW_CORE) {
    // Force in the core level for quick topology
    if (print_types[print_types_depth - 1] == KMP_HW_THREAD) {
      // Force core before thread e.g., 1 socket X 2 threads/socket
      // becomes 1 socket X 1 core/socket X 2 threads/socket
      print_types[print_types_depth - 1] = KMP_HW_CORE;
      print_types[print_types_depth++] = KMP_HW_THREAD;
    } else {
      print_types[print_types_depth++] = KMP_HW_CORE;
    }
  }
  // Always put threads at very end of quick topology
  if (equivalent[KMP_HW_THREAD] != KMP_HW_THREAD)
    print_types[print_types_depth++] = KMP_HW_THREAD;

  __kmp_str_buf_clear(&buf);
  kmp_hw_t numerator_type;
  kmp_hw_t denominator_type = KMP_HW_UNKNOWN;
  int core_level = get_level(KMP_HW_CORE);
  int ncores = get_count(core_level);

  for (int plevel = 0, level = 0; plevel < print_types_depth; ++plevel) {
    int c;
    bool plural;
    numerator_type = print_types[plevel];
    KMP_ASSERT_VALID_HW_TYPE(numerator_type);
    if (equivalent[numerator_type] != numerator_type)
      c = 1;
    else
      c = get_ratio(level++);
    plural = (c > 1);
    if (plevel == 0) {
      __kmp_str_buf_print(&buf, "%d %s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural));
    } else {
      __kmp_str_buf_print(&buf, " x %d %s/%s", c,
                          __kmp_hw_get_catalog_string(numerator_type, plural),
                          __kmp_hw_get_catalog_string(denominator_type));
    }
    denominator_type = numerator_type;
  }
  KMP_INFORM(TopologyGeneric, env_var, buf.str, ncores);

  if (num_hw_threads <= 0) {
    __kmp_str_buf_free(&buf);
    return;
  }

  // Full OS proc to hardware thread map
  KMP_INFORM(OSProcToPhysicalThreadMap, env_var);
  for (int i = 0; i < num_hw_threads; i++) {
    __kmp_str_buf_clear(&buf);
    for (int level = 0; level < depth; ++level) {
      kmp_hw_t type = types[level];
      __kmp_str_buf_print(&buf, "%s ", __kmp_hw_get_catalog_string(type));
      __kmp_str_buf_print(&buf, "%d ", hw_threads[i].ids[level]);
    }
    KMP_INFORM(OSProcMapToPack, env_var, hw_threads[i].os_id, buf.str);
  }

  __kmp_str_buf_free(&buf);
}
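// Illustrative sketch (assumed 2 sockets x 4 cores x 2 threads machine): with
// KMP_AFFINITY=verbose the quick-topology summary assembled above ends up in a
// line along the lines of
//   "2 sockets x 4 cores/socket x 2 threads/core (8 total cores)"
// with the surrounding wording supplied by the message catalog, followed by
// one "OS proc ... maps to ..." line per hardware thread.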
void kmp_topology_t::canonicalize() {
  _remove_radix1_layers();
  _gather_enumeration_information();
  _discover_uniformity();
  _set_sub_ids();
  _set_globals();
  _set_last_level_cache();

#if KMP_MIC_SUPPORTED
  // Manually Add L2 = Tile equivalence
  if (__kmp_mic_type == mic3) {
    if (get_level(KMP_HW_L2) != -1)
      set_equivalent_type(KMP_HW_TILE, KMP_HW_L2);
    else if (get_level(KMP_HW_TILE) != -1)
      set_equivalent_type(KMP_HW_L2, KMP_HW_TILE);
  }
#endif

  // Perform post canonicalization checking
  KMP_ASSERT(depth > 0);
  for (int level = 0; level < depth; ++level) {
    // All counts, ratios, and types must be valid
    KMP_ASSERT(count[level] > 0 && ratio[level] > 0);
    KMP_ASSERT_VALID_HW_TYPE(types[level]);
    // Detected types must point to themselves
    KMP_ASSERT(equivalent[types[level]] == types[level]);
  }

#if KMP_AFFINITY_SUPPORTED
  // Set the number of affinity granularity levels
  if (__kmp_affinity_gran_levels < 0) {
    kmp_hw_t gran_type = get_equivalent_type(__kmp_affinity_gran);
    // Check if user's granularity request is valid
    if (gran_type == KMP_HW_UNKNOWN) {
      // First try core, then thread, then package
      kmp_hw_t gran_types[3] = {KMP_HW_CORE, KMP_HW_THREAD, KMP_HW_SOCKET};
      for (auto g : gran_types) {
        if (__kmp_topology->get_equivalent_type(g) != KMP_HW_UNKNOWN) {
          gran_type = g;
          break;
        }
      }
      KMP_ASSERT(gran_type != KMP_HW_UNKNOWN);
      // Warn user what granularity setting will be used instead
      KMP_WARNING(AffGranularityBad, "KMP_AFFINITY",
                  __kmp_hw_get_catalog_string(__kmp_affinity_gran),
                  __kmp_hw_get_catalog_string(gran_type));
      __kmp_affinity_gran = gran_type;
    }
    __kmp_affinity_gran_levels = 0;
    for (int i = depth - 1; i >= 0 && get_type(i) != gran_type; --i)
      __kmp_affinity_gran_levels++;
  }
#endif // KMP_AFFINITY_SUPPORTED
}

// Canonicalize an explicit packages X cores/pkg X threads/core topology
void kmp_topology_t::canonicalize(int npackages, int ncores_per_pkg,
                                  int nthreads_per_core, int ncores) {
  int ndepth = 3;
  depth = ndepth;
  KMP_FOREACH_HW_TYPE(i) { equivalent[i] = KMP_HW_UNKNOWN; }
  for (int level = 0; level < depth; ++level) {
    count[level] = 0;
    ratio[level] = 0;
  }
  count[0] = npackages;
  count[1] = ncores;
  count[2] = __kmp_xproc;
  ratio[0] = npackages;
  ratio[1] = ncores_per_pkg;
  ratio[2] = nthreads_per_core;
  equivalent[KMP_HW_SOCKET] = KMP_HW_SOCKET;
  equivalent[KMP_HW_CORE] = KMP_HW_CORE;
  equivalent[KMP_HW_THREAD] = KMP_HW_THREAD;
  types[0] = KMP_HW_SOCKET;
  types[1] = KMP_HW_CORE;
  types[2] = KMP_HW_THREAD;
  //__kmp_avail_proc = __kmp_xproc;
  _discover_uniformity();
}

// Apply the KMP_HW_SUBSET envirable to the topology
// Returns true if KMP_HW_SUBSET filtered any processors
// otherwise, returns false
bool kmp_topology_t::filter_hw_subset() {
  // If KMP_HW_SUBSET wasn't requested, then do nothing.
  if (!__kmp_hw_subset)
    return false;

  // Check to see if KMP_HW_SUBSET is a valid subset of the detected topology
  int hw_subset_depth = __kmp_hw_subset->get_depth();
  kmp_hw_t specified[KMP_HW_LAST];
  KMP_ASSERT(hw_subset_depth > 0);
  KMP_FOREACH_HW_TYPE(i) { specified[i] = KMP_HW_UNKNOWN; }
  for (int i = 0; i < hw_subset_depth; ++i) {
    int max_count;
    int num = __kmp_hw_subset->at(i).num;
    int offset = __kmp_hw_subset->at(i).offset;
    kmp_hw_t type = __kmp_hw_subset->at(i).type;
    kmp_hw_t equivalent_type = equivalent[type];
    int level = get_level(type);

    // Check to see if current layer is in detected machine topology
    if (equivalent_type != KMP_HW_UNKNOWN) {
      __kmp_hw_subset->at(i).type = equivalent_type;
    } else {
      KMP_WARNING(AffHWSubsetNotExistGeneric,
                  __kmp_hw_get_catalog_string(type));
      return false;
    }

    // Check to see if current layer has already been specified
    // either directly or through an equivalent type
    if (specified[equivalent_type] != KMP_HW_UNKNOWN) {
      KMP_WARNING(AffHWSubsetEqvLayers, __kmp_hw_get_catalog_string(type),
                  __kmp_hw_get_catalog_string(specified[equivalent_type]));
      return false;
    }
    specified[equivalent_type] = type;

    // Check to see if layers are in order
    if (i + 1 < hw_subset_depth) {
      kmp_hw_t next_type = get_equivalent_type(__kmp_hw_subset->at(i + 1).type);
      if (next_type == KMP_HW_UNKNOWN) {
        KMP_WARNING(
            AffHWSubsetNotExistGeneric,
            __kmp_hw_get_catalog_string(__kmp_hw_subset->at(i + 1).type));
        return false;
      }
      int next_topology_level = get_level(next_type);
      if (level > next_topology_level) {
        KMP_WARNING(AffHWSubsetOutOfOrder, __kmp_hw_get_catalog_string(type),
                    __kmp_hw_get_catalog_string(next_type));
        return false;
      }
    }

    // Check to see if each layer's num & offset parameters are valid
    max_count = get_ratio(level);
    if (max_count < 0 || num + offset > max_count) {
      bool plural = (num > 1);
      KMP_WARNING(AffHWSubsetManyGeneric,
                  __kmp_hw_get_catalog_string(type, plural));
      return false;
    }
  }

  // Apply the filtered hardware subset
  int new_index = 0;
  for (int i = 0; i < num_hw_threads; ++i) {
    kmp_hw_thread_t &hw_thread = hw_threads[i];
    // Check to see if this hardware thread should be filtered
    bool should_be_filtered = false;
    for (int level = 0, hw_subset_index = 0;
         level < depth && hw_subset_index < hw_subset_depth; ++level) {
      kmp_hw_t topology_type = types[level];
      auto hw_subset_item = __kmp_hw_subset->at(hw_subset_index);
      kmp_hw_t hw_subset_type = hw_subset_item.type;
      if (topology_type != hw_subset_type)
        continue;
      int num = hw_subset_item.num;
      int offset = hw_subset_item.offset;
      hw_subset_index++;
      if (hw_thread.sub_ids[level] < offset ||
          hw_thread.sub_ids[level] >= offset + num) {
        should_be_filtered = true;
        break;
      }
    }
    if (!should_be_filtered) {
      if (i != new_index)
        hw_threads[new_index] = hw_thread;
      new_index++;
    } else {
#if KMP_AFFINITY_SUPPORTED
      KMP_CPU_CLR(hw_thread.os_id, __kmp_affin_fullMask);
#endif
      __kmp_avail_proc--;
    }
  }
  KMP_DEBUG_ASSERT(new_index <= num_hw_threads);
  num_hw_threads = new_index;

  // Post hardware subset canonicalization
  _gather_enumeration_information();
  _discover_uniformity();
  _set_globals();
  _set_last_level_cache();
  return true;
}
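// Illustrative sketch (assumed setting): with KMP_HW_SUBSET=1s,2c,1t on a
// 2 sockets x 4 cores x 2 threads machine, the filter above keeps only the
// hardware threads whose socket sub_id is 0, whose core sub_id is 0 or 1, and
// whose thread sub_id is 0 -- two OS procs in total. Every other proc is
// cleared from __kmp_affin_fullMask and subtracted from __kmp_avail_proc.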
bool kmp_topology_t::is_close(int hwt1, int hwt2, int hw_level) const {
  if (hw_level >= depth)
    return true;
  bool retval = true;
  const kmp_hw_thread_t &t1 = hw_threads[hwt1];
  const kmp_hw_thread_t &t2 = hw_threads[hwt2];
  for (int i = 0; i < (depth - hw_level); ++i) {
    if (t1.ids[i] != t2.ids[i])
      return false;
  }
  return retval;
}

////////////////////////////////////////////////////////////////////////////////

#if KMP_AFFINITY_SUPPORTED
class kmp_affinity_raii_t {
  kmp_affin_mask_t *mask;
  bool restored;

public:
  kmp_affinity_raii_t() : restored(false) {
    KMP_CPU_ALLOC(mask);
    KMP_ASSERT(mask != NULL);
    __kmp_get_system_affinity(mask, TRUE);
  }
  void restore() {
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE(mask);
    restored = true;
  }
  ~kmp_affinity_raii_t() {
    if (!restored) {
      __kmp_set_system_affinity(mask, TRUE);
      KMP_CPU_FREE(mask);
    }
  }
};
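// Illustrative usage sketch: the topology-detection code below relies on this
// RAII guard so the caller's affinity always comes back, even on early error
// returns, e.g.
//
//   kmp_affinity_raii_t previous_affinity; // saves the current mask
//   // ... bind to each proc in turn and run cpuid ...
//   previous_affinity.restore(); // or let the destructor restore on return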
bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%u", start);
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%u", previous);
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN
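// Illustrative sketch: a mask with bits {0,1,2,3,8,10,11,12} set prints as
// "0-3,8,10-12", and an empty mask prints as "{<empty>}". Callers must pass a
// buffer of at least 40 bytes (see the KMP_ASSERT above); output that would
// overflow the buffer is truncated at the last range that fits.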
// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%u-%u", start, previous);
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%u", start);
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%u", previous);
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// All of the __kmp_affinity_create_*_map() routines should allocate the
// internal topology object and set the layer ids for it. Each routine
// returns a boolean on whether it was successful at doing so.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

#if KMP_USE_HWLOC
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
  return hwloc_obj_type_is_cache(obj->type);
#else
  return obj->type == HWLOC_OBJ_CACHE;
#endif
}

// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {

  if (__kmp_hwloc_is_cache_type(obj)) {
    if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
      return KMP_HW_UNKNOWN;
    switch (obj->attr->cache.depth) {
    case 1:
      return KMP_HW_L1;
    case 2:
#if KMP_MIC_SUPPORTED
      if (__kmp_mic_type == mic3) {
        return KMP_HW_TILE;
      }
#endif
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

  switch (obj->type) {
  case HWLOC_OBJ_PACKAGE:
    return KMP_HW_SOCKET;
  case HWLOC_OBJ_NUMANODE:
    return KMP_HW_NUMA;
  case HWLOC_OBJ_CORE:
    return KMP_HW_CORE;
  case HWLOC_OBJ_PU:
    return KMP_HW_THREAD;
  case HWLOC_OBJ_GROUP:
    if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_DIE)
      return KMP_HW_DIE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_TILE)
      return KMP_HW_TILE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_INTEL_MODULE)
      return KMP_HW_MODULE;
    else if (obj->attr->group.kind == HWLOC_GROUP_KIND_WINDOWS_PROCESSOR_GROUP)
      return KMP_HW_PROC_GROUP;
    return KMP_HW_UNKNOWN;
#if HWLOC_API_VERSION >= 0x00020100
  case HWLOC_OBJ_DIE:
    return KMP_HW_DIE;
#endif
  }
  return KMP_HW_UNKNOWN;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL && hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology,
                                                       obj->type, first) == obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
                                  hwloc_obj_t lower) {
  hwloc_obj_t obj;
  hwloc_obj_type_t ltype = lower->type;
  int lindex = lower->logical_index - 1;
  int sub_id = 0;
  // Get the previous lower object
  obj = hwloc_get_obj_by_type(t, ltype, lindex);
  while (obj && lindex >= 0 &&
         hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
    if (obj->userdata) {
      sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
      break;
    }
    sub_id++;
    lindex--;
    obj = hwloc_get_obj_by_type(t, ltype, lindex);
  }
  // Store sub_id + 1 so that a sub_id of 0 can be distinguished from NULL
  lower->userdata = RCAST(void *, sub_id + 1);
  return sub_id;
}
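// Illustrative note (assumed layout): the sub_id is the object's position
// among its siblings under 'higher', so the third core enumerated under a
// package gets sub_id 2. The userdata cache above lets the backward walk stop
// at the nearest previously numbered sibling instead of rescanning all the
// way to logical index 0 for every PU.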
static bool __kmp_affinity_create_hwloc_map(kmp_i18n_id_t *const msg_id) {
  kmp_hw_t type;
  int hw_thread_index, sub_id;
  int depth;
  hwloc_obj_t pu, obj, root, prev;
  kmp_hw_t types[KMP_HW_LAST];
  hwloc_obj_type_t hwloc_types[KMP_HW_LAST];

  hwloc_topology_t tp = __kmp_hwloc_topology;
  *msg_id = kmp_i18n_null;
  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
  }

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from hwloc on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    return true;
  }

  root = hwloc_get_root_obj(tp);

  // Figure out the depth and types in the topology
  depth = 0;
  pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
  KMP_ASSERT(pu);
  obj = pu;
  types[depth] = KMP_HW_THREAD;
  hwloc_types[depth] = obj->type;
  depth++;
  while (obj != root && obj != NULL) {
    obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(tp, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE)
          break;
      }
      if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
        types[depth] = KMP_HW_NUMA;
        hwloc_types[depth] = memory->type;
        depth++;
      }
    }
#endif
    type = __kmp_hwloc_type_2_topology_type(obj);
    if (type != KMP_HW_UNKNOWN) {
      types[depth] = type;
      hwloc_types[depth] = obj->type;
      depth++;
    }
  }
  KMP_ASSERT(depth > 0);

  // Get the order for the types correct
  for (int i = 0, j = depth - 1; i < j; ++i, --j) {
    hwloc_obj_type_t hwloc_temp = hwloc_types[i];
    kmp_hw_t temp = types[i];
    types[i] = types[j];
    types[j] = temp;
    hwloc_types[i] = hwloc_types[j];
    hwloc_types[j] = hwloc_temp;
  }

  // Allocate the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);

  hw_thread_index = 0;
  pu = NULL;
  while (pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu)) {
    int index = depth - 1;
    bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index);
    if (included) {
      hw_thread.clear();
      hw_thread.ids[index] = pu->logical_index;
      hw_thread.os_id = pu->os_index;
      index--;
    }
    obj = pu;
    prev = obj;
    while (obj != root && obj != NULL) {
      obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
      // NUMA Nodes are handled differently since they are not within the
      // parent/child structure anymore. They are separate children
      // of obj (memory_first_child points to first memory child)
      if (obj->memory_arity) {
        hwloc_obj_t memory;
        for (memory = obj->memory_first_child; memory;
             memory = hwloc_get_next_child(tp, obj, memory)) {
          if (memory->type == HWLOC_OBJ_NUMANODE)
            break;
        }
        if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
          sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
          if (included) {
            hw_thread.ids[index] = memory->logical_index;
            hw_thread.ids[index + 1] = sub_id;
            index--;
          }
          prev = memory;
        }
        prev = obj;
      }
#endif
      type = __kmp_hwloc_type_2_topology_type(obj);
      if (type != KMP_HW_UNKNOWN) {
        sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
        if (included) {
          hw_thread.ids[index] = obj->logical_index;
          hw_thread.ids[index + 1] = sub_id;
          index--;
        }
        prev = obj;
      }
    }
    if (included)
      hw_thread_index++;
  }
  __kmp_topology->sort_ids();
  return true;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
static bool __kmp_affinity_create_flat_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_SOCKET, KMP_HW_CORE, KMP_HW_THREAD};

  if (__kmp_affinity_verbose) {
    KMP_INFORM(UsingFlatOS, "KMP_AFFINITY");
  }

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    return true;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i;
    hw_thread.ids[1] = 0;
    hw_thread.ids[2] = 0;
    avail_ct++;
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }
  return true;
}
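// Illustrative note: the flat map treats every available OS proc as its own
// single-core "package", so OS proc 5 gets ids {5, 0, 0} for the
// socket/core/thread layers. That keeps the rest of the affinity machinery
// working even when the real topology could not be detected.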
#if KMP_GROUP_AFFINITY
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
static bool __kmp_affinity_create_proc_group_map(kmp_i18n_id_t *const msg_id) {
  *msg_id = kmp_i18n_null;
  int depth = 3;
  kmp_hw_t types[] = {KMP_HW_PROC_GROUP, KMP_HW_CORE, KMP_HW_THREAD};
  const static size_t BITS_PER_GROUP = CHAR_BIT * sizeof(DWORD_PTR);

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
  }

  // If we aren't affinity capable, then use flat topology
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    nPackages = __kmp_num_proc_groups;
    __kmp_nThreadsPerCore = 1;
    __kmp_ncores = __kmp_xproc;
    nCoresPerPkg = nPackages / __kmp_ncores;
    return true;
  }

  // Construct the data structure to be returned.
  __kmp_topology = kmp_topology_t::allocate(__kmp_avail_proc, depth, types);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    kmp_hw_thread_t &hw_thread = __kmp_topology->at(avail_ct++);
    hw_thread.clear();
    hw_thread.os_id = i;
    hw_thread.ids[0] = i / BITS_PER_GROUP;
    hw_thread.ids[1] = hw_thread.ids[2] = i % BITS_PER_GROUP;
  }
  return true;
}
#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

template <kmp_uint32 LSB, kmp_uint32 MSB>
static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
  const kmp_uint32 SHIFT_RIGHT = LSB;
  kmp_uint32 retval = v;
  retval <<= SHIFT_LEFT;
  retval >>= (SHIFT_LEFT + SHIFT_RIGHT);
  return retval;
}

static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}
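// Illustrative sketch (assumed values): __kmp_extract_bits<8, 15>(0x0000AB12)
// keeps bits 8..15 and yields 0xAB, and __kmp_cpuid_mask_width(6) returns 3,
// since 2^3 = 8 is the smallest power of two that covers 6 items. These two
// helpers do the APIC-id field slicing for the legacy leaf 1/4 decoder and the
// x2APIC leaf 11/31 decoder below.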
class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; // ""
  unsigned maxThreadsPerPkg; // ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; // ""
  unsigned threadId; // ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

class kmp_cache_info_t {
public:
  struct info_t {
    unsigned level, mask;
  };
  kmp_cache_info_t() : depth(0) { get_leaf4_levels(); }
  size_t get_depth() const { return depth; }
  info_t &operator[](size_t index) { return table[index]; }
  const info_t &operator[](size_t index) const { return table[index]; }

  static kmp_hw_t get_topology_type(unsigned level) {
    KMP_DEBUG_ASSERT(level >= 1 && level <= MAX_CACHE_LEVEL);
    switch (level) {
    case 1:
      return KMP_HW_L1;
    case 2:
      return KMP_HW_L2;
    case 3:
      return KMP_HW_L3;
    }
    return KMP_HW_UNKNOWN;
  }

private:
  static const int MAX_CACHE_LEVEL = 3;

  size_t depth;
  info_t table[MAX_CACHE_LEVEL];

  void get_leaf4_levels() {
    unsigned level = 0;
    while (depth < MAX_CACHE_LEVEL) {
      unsigned cache_type, max_threads_sharing;
      unsigned cache_level, cache_mask_width;
      kmp_cpuid buf2;
      __kmp_x86_cpuid(4, level, &buf2);
      cache_type = __kmp_extract_bits<0, 4>(buf2.eax);
      if (!cache_type)
        break;
      // Skip instruction caches
      if (cache_type == 2) {
        level++;
        continue;
      }
      max_threads_sharing = __kmp_extract_bits<14, 25>(buf2.eax) + 1;
      cache_mask_width = __kmp_cpuid_mask_width(max_threads_sharing);
      cache_level = __kmp_extract_bits<5, 7>(buf2.eax);
      table[depth].level = cache_level;
      table[depth].mask = ((-1) << cache_mask_width);
      depth++;
      level++;
    }
  }
};

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static bool __kmp_affinity_create_apicid_map(kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *msg_id = kmp_i18n_null;

  if (__kmp_affinity_verbose) {
    KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
  }

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return false;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable
  // of calling __kmp_get_system_affinity() and __kmp_set_system_affinity(),
  // then we need to do something else - use the defaults that we calculated
  // from issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    return true;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affinity_raii_t previous_affinity;

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //   of this field determines the width of the core# + thread# fields in the
  //   Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
  //   exact. In particular, on certain OS/chip combinations where Intel(R)
  //   Hyper-Threading Technology is supported by the chip but has been
  //   disabled, the value of this field will be 2 (for a single core chip).
  //   On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //   Technology, the value of this field will be 1 when Intel(R)
  //   Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
  //   of this field (+1) determines the width of the core# field in the Apic
  //   Id. The comments in "cpucount.cpp" say that this value is an upper
  //   bound, but the IA-32 architecture manual says that it is exactly the
  //   number of cores per package, and I haven't seen any case where it
  //   wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
  unsigned i;
  apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
  unsigned nApics = 0;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

    __kmp_affinity_dispatch->bind_thread(i);
    threadInfo[nApics].osId = i;

    // The apic id and max threads per pkg come from cpuid(1).
    __kmp_x86_cpuid(1, 0, &buf);
    if (((buf.edx >> 9) & 1) == 0) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_ApicNotPresent;
      return false;
    }
    threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
    threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (threadInfo[nApics].maxThreadsPerPkg == 0) {
      threadInfo[nApics].maxThreadsPerPkg = 1;
    }

    // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
    // or greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      threadInfo[nApics].maxCoresPerPkg = 1;
    }

    // Infer the pkgId / coreId / threadId using only the info obtained locally.
    int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
    threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

    int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
    int widthT = widthCT - widthC;
    if (widthT < 0) {
      // I've never seen this one happen, but I suppose it could, if the cpuid
      // instruction on a chip was really screwed up. Make sure to restore the
      // affinity mask before the tail call.
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
      return false;
    }

    int maskC = (1 << widthC) - 1;
    threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;

    int maskT = (1 << widthT) - 1;
    threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

    nApics++;
  }
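  // Illustrative decode (assumed values): with maxThreadsPerPkg = 16 and
  // maxCoresPerPkg = 8, widthCT = 4, widthC = 3 and widthT = 1, so an apicId
  // of 0x1B (binary 1 101 1) splits into pkgId = 1, coreId = 5, threadId = 1.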
  // We've collected all the info we need.
  // Restore the old affinity mask for this thread.
  previous_affinity.restore();

  // Sort the threadInfo table by physical Id.
  qsort(threadInfo, nApics, sizeof(*threadInfo),
        __kmp_affinity_cmp_apicThreadInfo_phys_id);

  // The table is now sorted by pkgId / coreId / threadId, but we really don't
  // know the radix of any of the fields. pkgId's may be sparsely assigned among
  // the chips on a system. Although coreId's are usually assigned
  // [0 .. coresPerPkg-1] and threadId's are usually assigned
  // [0..threadsPerCore-1], we don't want to make any such assumptions.
  //
  // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
  // total # packages) are at this point - we want to determine that now. We
  // only have an upper bound on the first two figures.
  //
  // We also perform a consistency check at this point: the values returned by
  // the cpuid instruction for any thread bound to a given package had better
  // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
  nPackages = 1;
  nCoresPerPkg = 1;
  __kmp_nThreadsPerCore = 1;
  unsigned nCores = 1;

  unsigned pkgCt = 1; // to determine radii
  unsigned lastPkgId = threadInfo[0].pkgId;
  unsigned coreCt = 1;
  unsigned lastCoreId = threadInfo[0].coreId;
  unsigned threadCt = 1;
  unsigned lastThreadId = threadInfo[0].threadId;

  // intra-pkg consist checks
  unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
  unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

  for (i = 1; i < nApics; i++) {
    if (threadInfo[i].pkgId != lastPkgId) {
      nCores++;
      pkgCt++;
      lastPkgId = threadInfo[i].pkgId;
      if ((int)coreCt > nCoresPerPkg)
        nCoresPerPkg = coreCt;
      coreCt = 1;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;

      // This is a different package, so go on to the next iteration without
      // doing any consistency checks. Reset the consistency check vars, though.
      prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
      prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
      continue;
    }

    if (threadInfo[i].coreId != lastCoreId) {
      nCores++;
      coreCt++;
      lastCoreId = threadInfo[i].coreId;
      if ((int)threadCt > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = threadCt;
      threadCt = 1;
      lastThreadId = threadInfo[i].threadId;
    } else if (threadInfo[i].threadId != lastThreadId) {
      threadCt++;
      lastThreadId = threadInfo[i].threadId;
    } else {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
      return false;
    }

    // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
    // fields agree between all the threads bound to a given package.
    if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
        (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
      __kmp_free(threadInfo);
      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
      return false;
    }
  }
  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly
  nPackages = pkgCt;
  if ((int)coreCt > nCoresPerPkg)
    nCoresPerPkg = coreCt;
  if ((int)threadCt > __kmp_nThreadsPerCore)
    __kmp_nThreadsPerCore = threadCt;
  __kmp_ncores = nCores;
  KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);

  // Now that we've determined the number of packages, the number of cores per
  // package, and the number of threads per core, we can construct the data
  // structure that is to be returned.
  int idx = 0;
  int pkgLevel = 0;
  int coreLevel = 1;
  int threadLevel = 2;
  //(__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
2 : 1); 1744 int depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1745 kmp_hw_t types[3]; 1746 if (pkgLevel >= 0) 1747 types[idx++] = KMP_HW_SOCKET; 1748 if (coreLevel >= 0) 1749 types[idx++] = KMP_HW_CORE; 1750 if (threadLevel >= 0) 1751 types[idx++] = KMP_HW_THREAD; 1752 1753 KMP_ASSERT(depth > 0); 1754 __kmp_topology = kmp_topology_t::allocate(nApics, depth, types); 1755 1756 for (i = 0; i < nApics; ++i) { 1757 idx = 0; 1758 unsigned os = threadInfo[i].osId; 1759 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 1760 hw_thread.clear(); 1761 1762 if (pkgLevel >= 0) { 1763 hw_thread.ids[idx++] = threadInfo[i].pkgId; 1764 } 1765 if (coreLevel >= 0) { 1766 hw_thread.ids[idx++] = threadInfo[i].coreId; 1767 } 1768 if (threadLevel >= 0) { 1769 hw_thread.ids[idx++] = threadInfo[i].threadId; 1770 } 1771 hw_thread.os_id = os; 1772 } 1773 1774 __kmp_free(threadInfo); 1775 __kmp_topology->sort_ids(); 1776 if (!__kmp_topology->check_ids()) { 1777 kmp_topology_t::deallocate(__kmp_topology); 1778 __kmp_topology = nullptr; 1779 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1780 return false; 1781 } 1782 return true; 1783 } 1784 1785 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1786 // architectures support a newer interface for specifying the x2APIC Ids, 1787 // based on CPUID.B or CPUID.1F 1788 /* 1789 * CPUID.B or 1F, Input ECX (sub leaf # aka level number) 1790 Bits Bits Bits Bits 1791 31-16 15-8 7-4 4-0 1792 ---+-----------+--------------+-------------+-----------------+ 1793 EAX| reserved | reserved | reserved | Bits to Shift | 1794 ---+-----------|--------------+-------------+-----------------| 1795 EBX| reserved | Num logical processors at level (16 bits) | 1796 ---+-----------|--------------+-------------------------------| 1797 ECX| reserved | Level Type | Level Number (8 bits) | 1798 ---+-----------+--------------+-------------------------------| 1799 EDX| X2APIC ID (32 bits) | 1800 ---+----------------------------------------------------------+ 1801 */ 1802 1803 enum { 1804 INTEL_LEVEL_TYPE_INVALID = 0, // Package level 1805 INTEL_LEVEL_TYPE_SMT = 1, 1806 INTEL_LEVEL_TYPE_CORE = 2, 1807 INTEL_LEVEL_TYPE_TILE = 3, 1808 INTEL_LEVEL_TYPE_MODULE = 4, 1809 INTEL_LEVEL_TYPE_DIE = 5, 1810 INTEL_LEVEL_TYPE_LAST = 6, 1811 }; 1812 1813 struct cpuid_level_info_t { 1814 unsigned level_type, mask, mask_width, nitems, cache_mask; 1815 }; 1816 1817 static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) { 1818 switch (intel_type) { 1819 case INTEL_LEVEL_TYPE_INVALID: 1820 return KMP_HW_SOCKET; 1821 case INTEL_LEVEL_TYPE_SMT: 1822 return KMP_HW_THREAD; 1823 case INTEL_LEVEL_TYPE_CORE: 1824 return KMP_HW_CORE; 1825 case INTEL_LEVEL_TYPE_TILE: 1826 return KMP_HW_TILE; 1827 case INTEL_LEVEL_TYPE_MODULE: 1828 return KMP_HW_MODULE; 1829 case INTEL_LEVEL_TYPE_DIE: 1830 return KMP_HW_DIE; 1831 } 1832 return KMP_HW_UNKNOWN; 1833 } 1834 1835 // This function takes the topology leaf, a levels array to store the levels 1836 // detected and a bitmap of the known levels. 1837 // Returns the number of levels in the topology 1838 static unsigned 1839 __kmp_x2apicid_get_levels(int leaf, 1840 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST], 1841 kmp_uint64 known_levels) { 1842 unsigned level, levels_index; 1843 unsigned level_type, mask_width, nitems; 1844 kmp_cpuid buf; 1845 1846 // New algorithm has known topology layers act as highest unknown topology 1847 // layers when unknown topology layers exist. 
1848 // e.g., Suppose layers were SMT <X> CORE <Y> <Z> PACKAGE, where <X> <Y> <Z> 1849 // are unknown topology layers, Then SMT will take the characteristics of 1850 // (SMT x <X>) and CORE will take the characteristics of (CORE x <Y> x <Z>). 1851 // This eliminates unknown portions of the topology while still keeping the 1852 // correct structure. 1853 level = levels_index = 0; 1854 do { 1855 __kmp_x86_cpuid(leaf, level, &buf); 1856 level_type = __kmp_extract_bits<8, 15>(buf.ecx); 1857 mask_width = __kmp_extract_bits<0, 4>(buf.eax); 1858 nitems = __kmp_extract_bits<0, 15>(buf.ebx); 1859 if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0) 1860 return 0; 1861 1862 if (known_levels & (1ull << level_type)) { 1863 // Add a new level to the topology 1864 KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST); 1865 levels[levels_index].level_type = level_type; 1866 levels[levels_index].mask_width = mask_width; 1867 levels[levels_index].nitems = nitems; 1868 levels_index++; 1869 } else { 1870 // If it is an unknown level, then logically move the previous layer up 1871 if (levels_index > 0) { 1872 levels[levels_index - 1].mask_width = mask_width; 1873 levels[levels_index - 1].nitems = nitems; 1874 } 1875 } 1876 level++; 1877 } while (level_type != INTEL_LEVEL_TYPE_INVALID); 1878 1879 // Set the masks to & with apicid 1880 for (unsigned i = 0; i < levels_index; ++i) { 1881 if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) { 1882 levels[i].mask = ~((-1) << levels[i].mask_width); 1883 levels[i].cache_mask = (-1) << levels[i].mask_width; 1884 for (unsigned j = 0; j < i; ++j) 1885 levels[i].mask ^= levels[j].mask; 1886 } else { 1887 KMP_DEBUG_ASSERT(levels_index > 0); 1888 levels[i].mask = (-1) << levels[i - 1].mask_width; 1889 levels[i].cache_mask = 0; 1890 } 1891 } 1892 return levels_index; 1893 } 1894 1895 static bool __kmp_affinity_create_x2apicid_map(kmp_i18n_id_t *const msg_id) { 1896 1897 cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST]; 1898 kmp_hw_t types[INTEL_LEVEL_TYPE_LAST]; 1899 unsigned levels_index; 1900 kmp_cpuid buf; 1901 kmp_uint64 known_levels; 1902 int topology_leaf, highest_leaf, apic_id; 1903 int num_leaves; 1904 static int leaves[] = {0, 0}; 1905 1906 kmp_i18n_id_t leaf_message_id; 1907 1908 KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST); 1909 1910 *msg_id = kmp_i18n_null; 1911 if (__kmp_affinity_verbose) { 1912 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 1913 } 1914 1915 // Figure out the known topology levels 1916 known_levels = 0ull; 1917 for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) { 1918 if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) { 1919 known_levels |= (1ull << i); 1920 } 1921 } 1922 1923 // Get the highest cpuid leaf supported 1924 __kmp_x86_cpuid(0, 0, &buf); 1925 highest_leaf = buf.eax; 1926 1927 // If a specific topology method was requested, only allow that specific leaf 1928 // otherwise, try both leaves 31 and 11 in that order 1929 num_leaves = 0; 1930 if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 1931 num_leaves = 1; 1932 leaves[0] = 11; 1933 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1934 } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 1935 num_leaves = 1; 1936 leaves[0] = 31; 1937 leaf_message_id = kmp_i18n_str_NoLeaf31Support; 1938 } else { 1939 num_leaves = 2; 1940 leaves[0] = 31; 1941 leaves[1] = 11; 1942 leaf_message_id = kmp_i18n_str_NoLeaf11Support; 1943 } 1944 1945 // Check to see if cpuid leaf 31 or 11 is supported. 
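// Note: leaf 31 (0x1F) is the newer extended topology leaf and is tried first
// when both are allowed, since it can report additional level types (die,
// module, tile) that leaf 11 (0x0B) cannot. Illustrative mask layout built by
// __kmp_x2apicid_get_levels (hypothetical widths): with SMT mask_width = 1 and
// CORE mask_width = 5, SMT mask = 0x1, CORE mask = 0x1e (0x1f with the SMT bit
// removed), and the package-level mask = ~0x1f, so each hardware thread's ids
// can later be peeled off its x2APIC id field by field.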
1946 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1947 topology_leaf = -1; 1948 for (int i = 0; i < num_leaves; ++i) { 1949 int leaf = leaves[i]; 1950 if (highest_leaf < leaf) 1951 continue; 1952 __kmp_x86_cpuid(leaf, 0, &buf); 1953 if (buf.ebx == 0) 1954 continue; 1955 topology_leaf = leaf; 1956 levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels); 1957 if (levels_index == 0) 1958 continue; 1959 break; 1960 } 1961 if (topology_leaf == -1 || levels_index == 0) { 1962 *msg_id = leaf_message_id; 1963 return false; 1964 } 1965 KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST); 1966 1967 // The algorithm used starts by setting the affinity to each available thread 1968 // and retrieving info from the cpuid instruction, so if we are not capable of 1969 // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then 1970 // we need to do something else - use the defaults that we calculated from 1971 // issuing cpuid without binding to each proc. 1972 if (!KMP_AFFINITY_CAPABLE()) { 1973 // Hack to try to infer the machine topology using only the data 1974 // available from cpuid on the current thread, and __kmp_xproc. 1975 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1976 for (unsigned i = 0; i < levels_index; ++i) { 1977 if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) { 1978 __kmp_nThreadsPerCore = levels[i].nitems; 1979 } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) { 1980 nCoresPerPkg = levels[i].nitems; 1981 } 1982 } 1983 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1984 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1985 return true; 1986 } 1987 1988 // Allocate the data structure to be returned. 1989 int depth = levels_index; 1990 for (int i = depth - 1, j = 0; i >= 0; --i, ++j) 1991 types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type); 1992 __kmp_topology = 1993 kmp_topology_t::allocate(__kmp_avail_proc, levels_index, types); 1994 1995 // Insert equivalent cache types if they exist 1996 kmp_cache_info_t cache_info; 1997 for (size_t i = 0; i < cache_info.get_depth(); ++i) { 1998 const kmp_cache_info_t::info_t &info = cache_info[i]; 1999 unsigned cache_mask = info.mask; 2000 unsigned cache_level = info.level; 2001 for (unsigned j = 0; j < levels_index; ++j) { 2002 unsigned hw_cache_mask = levels[j].cache_mask; 2003 kmp_hw_t cache_type = kmp_cache_info_t::get_topology_type(cache_level); 2004 if (hw_cache_mask == cache_mask && j < levels_index - 1) { 2005 kmp_hw_t type = 2006 __kmp_intel_type_2_topology_type(levels[j + 1].level_type); 2007 __kmp_topology->set_equivalent_type(cache_type, type); 2008 } 2009 } 2010 } 2011 2012 // From here on, we can assume that it is safe to call 2013 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if 2014 // __kmp_affinity_type = affinity_none. 2015 2016 // Save the affinity mask for the current thread. 2017 kmp_affinity_raii_t previous_affinity; 2018 2019 // Run through each of the available contexts, binding the current thread 2020 // to it, and obtaining the pertinent information using the cpuid instr. 2021 unsigned int proc; 2022 int hw_thread_index = 0; 2023 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) { 2024 cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST]; 2025 unsigned my_levels_index; 2026 2027 // Skip this proc if it is not included in the machine model.
2028 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 2029 continue; 2030 } 2031 KMP_DEBUG_ASSERT(hw_thread_index < __kmp_avail_proc); 2032 2033 __kmp_affinity_dispatch->bind_thread(proc); 2034 2035 // New algorithm 2036 __kmp_x86_cpuid(topology_leaf, 0, &buf); 2037 apic_id = buf.edx; 2038 kmp_hw_thread_t &hw_thread = __kmp_topology->at(hw_thread_index); 2039 my_levels_index = 2040 __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels); 2041 if (my_levels_index == 0 || my_levels_index != levels_index) { 2042 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 2043 return false; 2044 } 2045 hw_thread.clear(); 2046 hw_thread.os_id = proc; 2047 // Put in topology information 2048 for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) { 2049 hw_thread.ids[idx] = apic_id & my_levels[j].mask; 2050 if (j > 0) { 2051 hw_thread.ids[idx] >>= my_levels[j - 1].mask_width; 2052 } 2053 } 2054 hw_thread_index++; 2055 } 2056 KMP_ASSERT(hw_thread_index > 0); 2057 __kmp_topology->sort_ids(); 2058 if (!__kmp_topology->check_ids()) { 2059 kmp_topology_t::deallocate(__kmp_topology); 2060 __kmp_topology = nullptr; 2061 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 2062 return false; 2063 } 2064 return true; 2065 } 2066 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 2067 2068 #define osIdIndex 0 2069 #define threadIdIndex 1 2070 #define coreIdIndex 2 2071 #define pkgIdIndex 3 2072 #define nodeIdIndex 4 2073 2074 typedef unsigned *ProcCpuInfo; 2075 static unsigned maxIndex = pkgIdIndex; 2076 2077 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, 2078 const void *b) { 2079 unsigned i; 2080 const unsigned *aa = *(unsigned *const *)a; 2081 const unsigned *bb = *(unsigned *const *)b; 2082 for (i = maxIndex;; i--) { 2083 if (aa[i] < bb[i]) 2084 return -1; 2085 if (aa[i] > bb[i]) 2086 return 1; 2087 if (i == osIdIndex) 2088 break; 2089 } 2090 return 0; 2091 } 2092 2093 #if KMP_USE_HIER_SCHED 2094 // Set the array sizes for the hierarchy layers 2095 static void __kmp_dispatch_set_hierarchy_values() { 2096 // Set the maximum number of L1's to number of cores 2097 // Set the maximum number of L2's to either number of cores / 2 for 2098 // Intel(R) Xeon Phi(TM) coprocessor formerly codenamed Knights Landing 2099 // or the number of cores for Intel(R) Xeon(R) processors 2100 // Set the maximum number of NUMA nodes and L3's to number of packages 2101 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] = 2102 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2103 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores; 2104 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2105 KMP_MIC_SUPPORTED 2106 if (__kmp_mic_type >= mic3) 2107 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2; 2108 else 2109 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2110 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores; 2111 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages; 2112 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages; 2113 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1; 2114 // Set the number of threads per unit 2115 // Number of hardware threads per L1/L2/L3/NUMA/LOOP 2116 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1; 2117 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] = 2118 __kmp_nThreadsPerCore; 2119 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \ 2120 KMP_MIC_SUPPORTED 2121 if (__kmp_mic_type >= mic3)
2122 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2123 2 * __kmp_nThreadsPerCore; 2124 else 2125 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) 2126 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] = 2127 __kmp_nThreadsPerCore; 2128 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] = 2129 nCoresPerPkg * __kmp_nThreadsPerCore; 2130 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] = 2131 nCoresPerPkg * __kmp_nThreadsPerCore; 2132 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] = 2133 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore; 2134 } 2135 2136 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc) 2137 // i.e., this thread's L1 or this thread's L2, etc. 2138 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) { 2139 int index = type + 1; 2140 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1]; 2141 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST); 2142 if (type == kmp_hier_layer_e::LAYER_THREAD) 2143 return tid; 2144 else if (type == kmp_hier_layer_e::LAYER_LOOP) 2145 return 0; 2146 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0); 2147 if (tid >= num_hw_threads) 2148 tid = tid % num_hw_threads; 2149 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index]; 2150 } 2151 2152 // Return the number of t1's per t2 2153 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) { 2154 int i1 = t1 + 1; 2155 int i2 = t2 + 1; 2156 KMP_DEBUG_ASSERT(i1 <= i2); 2157 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST); 2158 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST); 2159 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0); 2160 // (nthreads/t2) / (nthreads/t1) = t1 / t2 2161 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1]; 2162 } 2163 #endif // KMP_USE_HIER_SCHED 2164 2165 static inline const char *__kmp_cpuinfo_get_filename() { 2166 const char *filename; 2167 if (__kmp_cpuinfo_file != nullptr) 2168 filename = __kmp_cpuinfo_file; 2169 else 2170 filename = "/proc/cpuinfo"; 2171 return filename; 2172 } 2173 2174 static inline const char *__kmp_cpuinfo_get_envvar() { 2175 const char *envvar = nullptr; 2176 if (__kmp_cpuinfo_file != nullptr) 2177 envvar = "KMP_CPUINFO_FILE"; 2178 return envvar; 2179 } 2180 2181 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 2182 // affinity map. 2183 static bool __kmp_affinity_create_cpuinfo_map(int *line, 2184 kmp_i18n_id_t *const msg_id) { 2185 const char *filename = __kmp_cpuinfo_get_filename(); 2186 const char *envvar = __kmp_cpuinfo_get_envvar(); 2187 *msg_id = kmp_i18n_null; 2188 2189 if (__kmp_affinity_verbose) { 2190 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 2191 } 2192 2193 kmp_safe_raii_file_t f(filename, "r", envvar); 2194 2195 // Scan of the file, and count the number of "processor" (osId) fields, 2196 // and find the highest value of <n> for a node_<n> field. 
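// For reference, a typical record in this format looks like (illustrative
// values; fields vary by architecture):
//   processor       : 0
//   physical id     : 0
//   core id         : 0
//   ...
// with a blank line terminating each per-processor record. Only the fields
// recognized by the parser below are consumed; everything else is skipped.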
2197 char buf[256]; 2198 unsigned num_records = 0; 2199 while (!feof(f)) { 2200 buf[sizeof(buf) - 1] = 1; 2201 if (!fgets(buf, sizeof(buf), f)) { 2202 // Read errors presumably because of EOF 2203 break; 2204 } 2205 2206 char s1[] = "processor"; 2207 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2208 num_records++; 2209 continue; 2210 } 2211 2212 // FIXME - this will match "node_<n> <garbage>" 2213 unsigned level; 2214 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2215 // validate the input first: 2216 if (level > (unsigned)__kmp_xproc) { // level is too big 2217 level = __kmp_xproc; 2218 } 2219 if (nodeIdIndex + level >= maxIndex) { 2220 maxIndex = nodeIdIndex + level; 2221 } 2222 continue; 2223 } 2224 } 2225 2226 // Check for empty file / no valid processor records, or too many. The number 2227 // of records can't exceed the number of valid bits in the affinity mask. 2228 if (num_records == 0) { 2229 *msg_id = kmp_i18n_str_NoProcRecords; 2230 return false; 2231 } 2232 if (num_records > (unsigned)__kmp_xproc) { 2233 *msg_id = kmp_i18n_str_TooManyProcRecords; 2234 return false; 2235 } 2236 2237 // Set the file pointer back to the beginning, so that we can scan the file 2238 // again, this time performing a full parse of the data. Allocate a vector of 2239 // ProcCpuInfo objects, where we will place the data. Adding an extra element 2240 // at the end allows us to remove a lot of extra checks for termination 2241 // conditions. 2242 if (fseek(f, 0, SEEK_SET) != 0) { 2243 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 2244 return false; 2245 } 2246 2247 // Allocate the array of records to store the proc info in. The dummy 2248 // element at the end makes the logic in filling them out easier to code. 2249 unsigned **threadInfo = 2250 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *)); 2251 unsigned i; 2252 for (i = 0; i <= num_records; i++) { 2253 threadInfo[i] = 2254 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2255 } 2256 2257 #define CLEANUP_THREAD_INFO \ 2258 for (i = 0; i <= num_records; i++) { \ 2259 __kmp_free(threadInfo[i]); \ 2260 } \ 2261 __kmp_free(threadInfo); 2262 2263 // A value of UINT_MAX means that we didn't find the field 2264 unsigned __index; 2265 2266 #define INIT_PROC_INFO(p) \ 2267 for (__index = 0; __index <= maxIndex; __index++) { \ 2268 (p)[__index] = UINT_MAX; \ 2269 } 2270 2271 for (i = 0; i <= num_records; i++) { 2272 INIT_PROC_INFO(threadInfo[i]); 2273 } 2274 2275 unsigned num_avail = 0; 2276 *line = 0; 2277 while (!feof(f)) { 2278 // Create an inner scoping level, so that all the goto targets at the end of 2279 // the loop appear in an outer scoping level. This avoids warnings about 2280 // jumping past an initialization to a target in the same block. 2281 { 2282 buf[sizeof(buf) - 1] = 1; 2283 bool long_line = false; 2284 if (!fgets(buf, sizeof(buf), f)) { 2285 // Read errors presumably because of EOF 2286 // If there is valid data in threadInfo[num_avail], then fake 2287 // a blank line to ensure that the last address gets parsed. 2288 bool valid = false; 2289 for (i = 0; i <= maxIndex; i++) { 2290 if (threadInfo[num_avail][i] != UINT_MAX) { 2291 valid = true; 2292 } 2293 } 2294 if (!valid) { 2295 break; 2296 } 2297 buf[0] = 0; 2298 } else if (!buf[sizeof(buf) - 1]) { 2299 // The line is longer than the buffer. Set a flag and don't 2300 // emit an error if we were going to ignore the line, anyway.
2301 long_line = true; 2302 2303 #define CHECK_LINE \ 2304 if (long_line) { \ 2305 CLEANUP_THREAD_INFO; \ 2306 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2307 return false; \ 2308 } 2309 } 2310 (*line)++; 2311 2312 char s1[] = "processor"; 2313 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2314 CHECK_LINE; 2315 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2316 unsigned val; 2317 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2318 goto no_val; 2319 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) 2320 #if KMP_ARCH_AARCH64 2321 // Handle the old AArch64 /proc/cpuinfo layout differently, 2322 // it contains all of the 'processor' entries listed in a 2323 // single 'Processor' section, therefore the normal looking 2324 // for duplicates in that section will always fail. 2325 num_avail++; 2326 #else 2327 goto dup_field; 2328 #endif 2329 threadInfo[num_avail][osIdIndex] = val; 2330 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64) 2331 char path[256]; 2332 KMP_SNPRINTF( 2333 path, sizeof(path), 2334 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2335 threadInfo[num_avail][osIdIndex]); 2336 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2337 2338 KMP_SNPRINTF(path, sizeof(path), 2339 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2340 threadInfo[num_avail][osIdIndex]); 2341 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2342 continue; 2343 #else 2344 } 2345 char s2[] = "physical id"; 2346 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2347 CHECK_LINE; 2348 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2349 unsigned val; 2350 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2351 goto no_val; 2352 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) 2353 goto dup_field; 2354 threadInfo[num_avail][pkgIdIndex] = val; 2355 continue; 2356 } 2357 char s3[] = "core id"; 2358 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2359 CHECK_LINE; 2360 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2361 unsigned val; 2362 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2363 goto no_val; 2364 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) 2365 goto dup_field; 2366 threadInfo[num_avail][coreIdIndex] = val; 2367 continue; 2368 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2369 } 2370 char s4[] = "thread id"; 2371 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2372 CHECK_LINE; 2373 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2374 unsigned val; 2375 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2376 goto no_val; 2377 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) 2378 goto dup_field; 2379 threadInfo[num_avail][threadIdIndex] = val; 2380 continue; 2381 } 2382 unsigned level; 2383 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) { 2384 CHECK_LINE; 2385 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2386 unsigned val; 2387 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) 2388 goto no_val; 2389 // validate the input before using level: 2390 if (level > (unsigned)__kmp_xproc) { // level is too big 2391 level = __kmp_xproc; 2392 } 2393 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) 2394 goto dup_field; 2395 threadInfo[num_avail][nodeIdIndex + level] = val; 2396 continue; 2397 } 2398 2399 // We didn't recognize the leading token on the line. There are lots of 2400 // leading tokens that we don't recognize - if the line isn't empty, go on 2401 // to the next line. 
2402 if ((*buf != 0) && (*buf != '\n')) { 2403 // If the line is longer than the buffer, read characters 2404 // until we find a newline. 2405 if (long_line) { 2406 int ch; 2407 while (((ch = fgetc(f)) != EOF) && (ch != '\n')) 2408 ; 2409 } 2410 continue; 2411 } 2412 2413 // A newline has signalled the end of the processor record. 2414 // Check that there aren't too many procs specified. 2415 if ((int)num_avail == __kmp_xproc) { 2416 CLEANUP_THREAD_INFO; 2417 *msg_id = kmp_i18n_str_TooManyEntries; 2418 return false; 2419 } 2420 2421 // Check for missing fields. The osId field must be there, and we 2422 // currently require that the physical id field is specified, also. 2423 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2424 CLEANUP_THREAD_INFO; 2425 *msg_id = kmp_i18n_str_MissingProcField; 2426 return false; 2427 } 2428 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2429 CLEANUP_THREAD_INFO; 2430 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2431 return false; 2432 } 2433 2434 // Skip this proc if it is not included in the machine model. 2435 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], 2436 __kmp_affin_fullMask)) { 2437 INIT_PROC_INFO(threadInfo[num_avail]); 2438 continue; 2439 } 2440 2441 // We have a successful parse of this proc's info. 2442 // Increment the counter, and prepare for the next proc. 2443 num_avail++; 2444 KMP_ASSERT(num_avail <= num_records); 2445 INIT_PROC_INFO(threadInfo[num_avail]); 2446 } 2447 continue; 2448 2449 no_val: 2450 CLEANUP_THREAD_INFO; 2451 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2452 return false; 2453 2454 dup_field: 2455 CLEANUP_THREAD_INFO; 2456 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2457 return false; 2458 } 2459 *line = 0; 2460 2461 #if KMP_MIC && REDUCE_TEAM_SIZE 2462 unsigned teamSize = 0; 2463 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2464 2465 // check for num_records == __kmp_xproc ??? 2466 2467 // If it is configured to omit the package level when there is only a single 2468 // package, the logic at the end of this routine won't work if there is only a 2469 // single thread 2470 KMP_ASSERT(num_avail > 0); 2471 KMP_ASSERT(num_avail <= num_records); 2472 2473 // Sort the threadInfo table by physical Id. 2474 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2475 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2476 2477 // The table is now sorted by pkgId / coreId / threadId, but we really don't 2478 // know the radix of any of the fields. pkgId's may be sparsely assigned among 2479 // the chips on a system. Although coreId's are usually assigned 2480 // [0 .. coresPerPkg-1] and threadId's are usually assigned 2481 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2482 // 2483 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the 2484 // total # packages) are at this point - we want to determine that now. We 2485 // only have an upper bound on the first two figures. 2486 unsigned *counts = 2487 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2488 unsigned *maxCt = 2489 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2490 unsigned *totals = 2491 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2492 unsigned *lastId = 2493 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned)); 2494 2495 bool assign_thread_ids = false; 2496 unsigned threadIdCt; 2497 unsigned index; 2498 2499 restart_radix_check: 2500 threadIdCt = 0; 2501 2502 // Initialize the counter arrays with data from threadInfo[0]. 
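// The radix scan below makes a single pass over the sorted table, tracking for
// each index (thread/core/pkg/node) the size of the current group (counts[]),
// the largest group seen so far (maxCt[]), the total number of distinct units
// at that level (totals[]), and the previously seen id (lastId[]). As a
// hypothetical example, two packages with two cores each and one thread per
// core end up with totals[pkgIdIndex] = 2, maxCt[coreIdIndex] = 2, and
// maxCt[threadIdIndex] = 1.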
2503 if (assign_thread_ids) { 2504 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2505 threadInfo[0][threadIdIndex] = threadIdCt++; 2506 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2507 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2508 } 2509 } 2510 for (index = 0; index <= maxIndex; index++) { 2511 counts[index] = 1; 2512 maxCt[index] = 1; 2513 totals[index] = 1; 2514 lastId[index] = threadInfo[0][index]; 2515 ; 2516 } 2517 2518 // Run through the rest of the OS procs. 2519 for (i = 1; i < num_avail; i++) { 2520 // Find the most significant index whose id differs from the id for the 2521 // previous OS proc. 2522 for (index = maxIndex; index >= threadIdIndex; index--) { 2523 if (assign_thread_ids && (index == threadIdIndex)) { 2524 // Auto-assign the thread id field if it wasn't specified. 2525 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2526 threadInfo[i][threadIdIndex] = threadIdCt++; 2527 } 2528 // Apparently the thread id field was specified for some entries and not 2529 // others. Start the thread id counter off at the next higher thread id. 2530 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2531 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2532 } 2533 } 2534 if (threadInfo[i][index] != lastId[index]) { 2535 // Run through all indices which are less significant, and reset the 2536 // counts to 1. At all levels up to and including index, we need to 2537 // increment the totals and record the last id. 2538 unsigned index2; 2539 for (index2 = threadIdIndex; index2 < index; index2++) { 2540 totals[index2]++; 2541 if (counts[index2] > maxCt[index2]) { 2542 maxCt[index2] = counts[index2]; 2543 } 2544 counts[index2] = 1; 2545 lastId[index2] = threadInfo[i][index2]; 2546 } 2547 counts[index]++; 2548 totals[index]++; 2549 lastId[index] = threadInfo[i][index]; 2550 2551 if (assign_thread_ids && (index > threadIdIndex)) { 2552 2553 #if KMP_MIC && REDUCE_TEAM_SIZE 2554 // The default team size is the total #threads in the machine 2555 // minus 1 thread for every core that has 3 or more threads. 2556 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2557 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2558 2559 // Restart the thread counter, as we are on a new core. 2560 threadIdCt = 0; 2561 2562 // Auto-assign the thread id field if it wasn't specified. 2563 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2564 threadInfo[i][threadIdIndex] = threadIdCt++; 2565 } 2566 2567 // Apparently the thread id field was specified for some entries and 2568 // not others. Start the thread id counter off at the next higher 2569 // thread id. 2570 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2571 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2572 } 2573 } 2574 break; 2575 } 2576 } 2577 if (index < threadIdIndex) { 2578 // If thread ids were specified, it is an error if they are not unique. 2579 // Also, check that we haven't already restarted the loop (to be safe - 2580 // shouldn't need to). 2581 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) { 2582 __kmp_free(lastId); 2583 __kmp_free(totals); 2584 __kmp_free(maxCt); 2585 __kmp_free(counts); 2586 CLEANUP_THREAD_INFO; 2587 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2588 return false; 2589 } 2590 2591 // If the thread ids were not specified and we see entries that 2592 // are duplicates, start the loop over and assign the thread ids manually.
2593 assign_thread_ids = true; 2594 goto restart_radix_check; 2595 } 2596 } 2597 2598 #if KMP_MIC && REDUCE_TEAM_SIZE 2599 // The default team size is the total #threads in the machine 2600 // minus 1 thread for every core that has 3 or more threads. 2601 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1); 2602 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2603 2604 for (index = threadIdIndex; index <= maxIndex; index++) { 2605 if (counts[index] > maxCt[index]) { 2606 maxCt[index] = counts[index]; 2607 } 2608 } 2609 2610 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2611 nCoresPerPkg = maxCt[coreIdIndex]; 2612 nPackages = totals[pkgIdIndex]; 2613 2614 // When affinity is off, this routine will still be called to set 2615 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages. 2616 // Make sure all these vars are set correctly, and return now if affinity is 2617 // not enabled. 2618 __kmp_ncores = totals[coreIdIndex]; 2619 if (!KMP_AFFINITY_CAPABLE()) { 2620 KMP_ASSERT(__kmp_affinity_type == affinity_none); 2621 return true; 2622 } 2623 2624 #if KMP_MIC && REDUCE_TEAM_SIZE 2625 // Set the default team size. 2626 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2627 __kmp_dflt_team_nth = teamSize; 2628 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting " 2629 "__kmp_dflt_team_nth = %d\n", 2630 __kmp_dflt_team_nth)); 2631 } 2632 #endif // KMP_MIC && REDUCE_TEAM_SIZE 2633 2634 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc); 2635 2636 // Count the number of levels which have more nodes at that level than at the 2637 // parent's level (with there being an implicit root node of the top level). 2638 // This is equivalent to saying that there is at least one node at this level 2639 // which has a sibling. These levels are in the map, and the package level is 2640 // always in the map. 2641 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2642 for (index = threadIdIndex; index < maxIndex; index++) { 2643 KMP_ASSERT(totals[index] >= totals[index + 1]); 2644 inMap[index] = (totals[index] > totals[index + 1]); 2645 } 2646 inMap[maxIndex] = (totals[maxIndex] > 1); 2647 inMap[pkgIdIndex] = true; 2648 inMap[coreIdIndex] = true; 2649 inMap[threadIdIndex] = true; 2650 2651 int depth = 0; 2652 int idx = 0; 2653 kmp_hw_t types[KMP_HW_LAST]; 2654 int pkgLevel = -1; 2655 int coreLevel = -1; 2656 int threadLevel = -1; 2657 for (index = threadIdIndex; index <= maxIndex; index++) { 2658 if (inMap[index]) { 2659 depth++; 2660 } 2661 } 2662 if (inMap[pkgIdIndex]) { 2663 pkgLevel = idx; 2664 types[idx++] = KMP_HW_SOCKET; 2665 } 2666 if (inMap[coreIdIndex]) { 2667 coreLevel = idx; 2668 types[idx++] = KMP_HW_CORE; 2669 } 2670 if (inMap[threadIdIndex]) { 2671 threadLevel = idx; 2672 types[idx++] = KMP_HW_THREAD; 2673 } 2674 KMP_ASSERT(depth > 0); 2675 2676 // Construct the data structure that is to be returned. 
2677 __kmp_topology = kmp_topology_t::allocate(num_avail, depth, types); 2678 2679 for (i = 0; i < num_avail; ++i) { 2680 unsigned os = threadInfo[i][osIdIndex]; 2681 int src_index; 2682 int dst_index = 0; 2683 kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 2684 hw_thread.clear(); 2685 hw_thread.os_id = os; 2686 2687 idx = 0; 2688 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2689 if (!inMap[src_index]) { 2690 continue; 2691 } 2692 if (src_index == pkgIdIndex) { 2693 hw_thread.ids[pkgLevel] = threadInfo[i][src_index]; 2694 } else if (src_index == coreIdIndex) { 2695 hw_thread.ids[coreLevel] = threadInfo[i][src_index]; 2696 } else if (src_index == threadIdIndex) { 2697 hw_thread.ids[threadLevel] = threadInfo[i][src_index]; 2698 } 2699 dst_index++; 2700 } 2701 } 2702 2703 __kmp_free(inMap); 2704 __kmp_free(lastId); 2705 __kmp_free(totals); 2706 __kmp_free(maxCt); 2707 __kmp_free(counts); 2708 CLEANUP_THREAD_INFO; 2709 __kmp_topology->sort_ids(); 2710 if (!__kmp_topology->check_ids()) { 2711 kmp_topology_t::deallocate(__kmp_topology); 2712 __kmp_topology = nullptr; 2713 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2714 return false; 2715 } 2716 return true; 2717 } 2718 2719 // Create and return a table of affinity masks, indexed by OS thread ID. 2720 // This routine handles OR'ing together all the affinity masks of threads 2721 // that are sufficiently close, if granularity > fine. 2722 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex, 2723 unsigned *numUnique) { 2724 // First form a table of affinity masks in order of OS thread id. 2725 int maxOsId; 2726 int i; 2727 int numAddrs = __kmp_topology->get_num_hw_threads(); 2728 int depth = __kmp_topology->get_depth(); 2729 KMP_ASSERT(numAddrs); 2730 KMP_ASSERT(depth); 2731 2732 maxOsId = 0; 2733 for (i = numAddrs - 1;; --i) { 2734 int osId = __kmp_topology->at(i).os_id; 2735 if (osId > maxOsId) { 2736 maxOsId = osId; 2737 } 2738 if (i == 0) 2739 break; 2740 } 2741 kmp_affin_mask_t *osId2Mask; 2742 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1)); 2743 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2744 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2745 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2746 } 2747 if (__kmp_affinity_gran_levels >= (int)depth) { 2748 if (__kmp_affinity_verbose || 2749 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 2750 KMP_WARNING(AffThreadsMayMigrate); 2751 } 2752 } 2753 2754 // Run through the table, forming the masks for all threads on each core. 2755 // Threads on the same core will have identical kmp_hw_thread_t objects, not 2756 // considering the last level, which must be the thread id. All threads on a 2757 // core will appear consecutively. 2758 int unique = 0; 2759 int j = 0; // index of 1st thread on core 2760 int leader = 0; 2761 kmp_affin_mask_t *sum; 2762 KMP_CPU_ALLOC_ON_STACK(sum); 2763 KMP_CPU_ZERO(sum); 2764 KMP_CPU_SET(__kmp_topology->at(0).os_id, sum); 2765 for (i = 1; i < numAddrs; i++) { 2766 // If this thread is sufficiently close to the leader (within the 2767 // granularity setting), then set the bit for this os thread in the 2768 // affinity mask for this group, and go on to the next thread. 2769 if (__kmp_topology->is_close(leader, i, __kmp_affinity_gran_levels)) { 2770 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2771 continue; 2772 } 2773 2774 // For every thread in this group, copy the mask to the thread's entry in 2775 // the osId2Mask table. Mark the first address as a leader. 
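// For example (hypothetical 2-way SMT machine with granularity=core, i.e.
// __kmp_affinity_gran_levels == 1): OS procs 0 and 1 share a core, so both of
// their entries in osId2Mask receive the mask {0,1}, and the first of them is
// marked as the group's leader.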
2776 for (; j < i; j++) { 2777 int osId = __kmp_topology->at(j).os_id; 2778 KMP_DEBUG_ASSERT(osId <= maxOsId); 2779 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2780 KMP_CPU_COPY(mask, sum); 2781 __kmp_topology->at(j).leader = (j == leader); 2782 } 2783 unique++; 2784 2785 // Start a new mask. 2786 leader = i; 2787 KMP_CPU_ZERO(sum); 2788 KMP_CPU_SET(__kmp_topology->at(i).os_id, sum); 2789 } 2790 2791 // For every thread in last group, copy the mask to the thread's 2792 // entry in the osId2Mask table. 2793 for (; j < i; j++) { 2794 int osId = __kmp_topology->at(j).os_id; 2795 KMP_DEBUG_ASSERT(osId <= maxOsId); 2796 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2797 KMP_CPU_COPY(mask, sum); 2798 __kmp_topology->at(j).leader = (j == leader); 2799 } 2800 unique++; 2801 KMP_CPU_FREE_FROM_STACK(sum); 2802 2803 *maxIndex = maxOsId; 2804 *numUnique = unique; 2805 return osId2Mask; 2806 } 2807 2808 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2809 // as file-static than to try and pass them through the calling sequence of 2810 // the recursive-descent OMP_PLACES parser. 2811 static kmp_affin_mask_t *newMasks; 2812 static int numNewMasks; 2813 static int nextNewMask; 2814 2815 #define ADD_MASK(_mask) \ 2816 { \ 2817 if (nextNewMask >= numNewMasks) { \ 2818 int i; \ 2819 numNewMasks *= 2; \ 2820 kmp_affin_mask_t *temp; \ 2821 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2822 for (i = 0; i < numNewMasks / 2; i++) { \ 2823 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \ 2824 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \ 2825 KMP_CPU_COPY(dest, src); \ 2826 } \ 2827 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \ 2828 newMasks = temp; \ 2829 } \ 2830 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2831 nextNewMask++; \ 2832 } 2833 2834 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \ 2835 { \ 2836 if (((_osId) > _maxOsId) || \ 2837 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2838 if (__kmp_affinity_verbose || \ 2839 (__kmp_affinity_warnings && \ 2840 (__kmp_affinity_type != affinity_none))) { \ 2841 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2842 } \ 2843 } else { \ 2844 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2845 } \ 2846 } 2847 2848 // Re-parse the proclist (for the explicit affinity type), and form the list 2849 // of affinity newMasks indexed by gtid. 2850 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2851 unsigned int *out_numMasks, 2852 const char *proclist, 2853 kmp_affin_mask_t *osId2Mask, 2854 int maxOsId) { 2855 int i; 2856 const char *scan = proclist; 2857 const char *next = proclist; 2858 2859 // We use malloc() for the temporary mask vector, so that we can use 2860 // realloc() to extend it. 2861 numNewMasks = 2; 2862 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2863 nextNewMask = 0; 2864 kmp_affin_mask_t *sumMask; 2865 KMP_CPU_ALLOC(sumMask); 2866 int setSize = 0; 2867 2868 for (;;) { 2869 int start, end, stride; 2870 2871 SKIP_WS(scan); 2872 next = scan; 2873 if (*next == '\0') { 2874 break; 2875 } 2876 2877 if (*next == '{') { 2878 int num; 2879 setSize = 0; 2880 next++; // skip '{' 2881 SKIP_WS(next); 2882 scan = next; 2883 2884 // Read the first integer in the set. 
2885 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist"); 2886 SKIP_DIGITS(next); 2887 num = __kmp_str_to_int(scan, *next); 2888 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2889 2890 // Copy the mask for that osId to the sum (union) mask. 2891 if ((num > maxOsId) || 2892 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2893 if (__kmp_affinity_verbose || 2894 (__kmp_affinity_warnings && 2895 (__kmp_affinity_type != affinity_none))) { 2896 KMP_WARNING(AffIgnoreInvalidProcID, num); 2897 } 2898 KMP_CPU_ZERO(sumMask); 2899 } else { 2900 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2901 setSize = 1; 2902 } 2903 2904 for (;;) { 2905 // Check for end of set. 2906 SKIP_WS(next); 2907 if (*next == '}') { 2908 next++; // skip '}' 2909 break; 2910 } 2911 2912 // Skip optional comma. 2913 if (*next == ',') { 2914 next++; 2915 } 2916 SKIP_WS(next); 2917 2918 // Read the next integer in the set. 2919 scan = next; 2920 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2921 2922 SKIP_DIGITS(next); 2923 num = __kmp_str_to_int(scan, *next); 2924 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2925 2926 // Add the mask for that osId to the sum mask. 2927 if ((num > maxOsId) || 2928 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2929 if (__kmp_affinity_verbose || 2930 (__kmp_affinity_warnings && 2931 (__kmp_affinity_type != affinity_none))) { 2932 KMP_WARNING(AffIgnoreInvalidProcID, num); 2933 } 2934 } else { 2935 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2936 setSize++; 2937 } 2938 } 2939 if (setSize > 0) { 2940 ADD_MASK(sumMask); 2941 } 2942 2943 SKIP_WS(next); 2944 if (*next == ',') { 2945 next++; 2946 } 2947 scan = next; 2948 continue; 2949 } 2950 2951 // Read the first integer. 2952 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2953 SKIP_DIGITS(next); 2954 start = __kmp_str_to_int(scan, *next); 2955 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2956 SKIP_WS(next); 2957 2958 // If this isn't a range, then add a mask to the list and go on. 2959 if (*next != '-') { 2960 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2961 2962 // Skip optional comma. 2963 if (*next == ',') { 2964 next++; 2965 } 2966 scan = next; 2967 continue; 2968 } 2969 2970 // This is a range. Skip over the '-' and read in the 2nd int. 2971 next++; // skip '-' 2972 SKIP_WS(next); 2973 scan = next; 2974 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2975 SKIP_DIGITS(next); 2976 end = __kmp_str_to_int(scan, *next); 2977 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2978 2979 // Check for a stride parameter 2980 stride = 1; 2981 SKIP_WS(next); 2982 if (*next == ':') { 2983 // A stride is specified. Skip over the ':" and read the 3rd int. 2984 int sign = +1; 2985 next++; // skip ':' 2986 SKIP_WS(next); 2987 scan = next; 2988 if (*next == '-') { 2989 sign = -1; 2990 next++; 2991 SKIP_WS(next); 2992 scan = next; 2993 } 2994 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2995 SKIP_DIGITS(next); 2996 stride = __kmp_str_to_int(scan, *next); 2997 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2998 stride *= sign; 2999 } 3000 3001 // Do some range checks. 
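// Illustration of the expansion performed by the loop below (hypothetical
// proclist): the range "0-6:2" adds one mask each for OS procs 0, 2, 4 and 6,
// whereas a braced set such as "{0,1}" was handled above and yields a single
// mask that is the union of the listed procs.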
3002 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 3003 if (stride > 0) { 3004 KMP_ASSERT2(start <= end, "bad explicit proc list"); 3005 } else { 3006 KMP_ASSERT2(start >= end, "bad explicit proc list"); 3007 } 3008 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 3009 3010 // Add the mask for each OS proc # to the list. 3011 if (stride > 0) { 3012 do { 3013 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3014 start += stride; 3015 } while (start <= end); 3016 } else { 3017 do { 3018 ADD_MASK_OSID(start, osId2Mask, maxOsId); 3019 start += stride; 3020 } while (start >= end); 3021 } 3022 3023 // Skip optional comma. 3024 SKIP_WS(next); 3025 if (*next == ',') { 3026 next++; 3027 } 3028 scan = next; 3029 } 3030 3031 *out_numMasks = nextNewMask; 3032 if (nextNewMask == 0) { 3033 *out_masks = NULL; 3034 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3035 return; 3036 } 3037 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3038 for (i = 0; i < nextNewMask; i++) { 3039 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3040 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3041 KMP_CPU_COPY(dest, src); 3042 } 3043 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3044 KMP_CPU_FREE(sumMask); 3045 } 3046 3047 /*----------------------------------------------------------------------------- 3048 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 3049 places. Again, Here is the grammar: 3050 3051 place_list := place 3052 place_list := place , place_list 3053 place := num 3054 place := place : num 3055 place := place : num : signed 3056 place := { subplacelist } 3057 place := ! place // (lowest priority) 3058 subplace_list := subplace 3059 subplace_list := subplace , subplace_list 3060 subplace := num 3061 subplace := num : num 3062 subplace := num : num : signed 3063 signed := num 3064 signed := + signed 3065 signed := - signed 3066 -----------------------------------------------------------------------------*/ 3067 static void __kmp_process_subplace_list(const char **scan, 3068 kmp_affin_mask_t *osId2Mask, 3069 int maxOsId, kmp_affin_mask_t *tempMask, 3070 int *setSize) { 3071 const char *next; 3072 3073 for (;;) { 3074 int start, count, stride, i; 3075 3076 // Read in the starting proc id 3077 SKIP_WS(*scan); 3078 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3079 next = *scan; 3080 SKIP_DIGITS(next); 3081 start = __kmp_str_to_int(*scan, *next); 3082 KMP_ASSERT(start >= 0); 3083 *scan = next; 3084 3085 // valid follow sets are ',' ':' and '}' 3086 SKIP_WS(*scan); 3087 if (**scan == '}' || **scan == ',') { 3088 if ((start > maxOsId) || 3089 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3090 if (__kmp_affinity_verbose || 3091 (__kmp_affinity_warnings && 3092 (__kmp_affinity_type != affinity_none))) { 3093 KMP_WARNING(AffIgnoreInvalidProcID, start); 3094 } 3095 } else { 3096 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3097 (*setSize)++; 3098 } 3099 if (**scan == '}') { 3100 break; 3101 } 3102 (*scan)++; // skip ',' 3103 continue; 3104 } 3105 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3106 (*scan)++; // skip ':' 3107 3108 // Read count parameter 3109 SKIP_WS(*scan); 3110 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3111 next = *scan; 3112 SKIP_DIGITS(next); 3113 count = __kmp_str_to_int(*scan, *next); 3114 KMP_ASSERT(count >= 0); 3115 *scan = next; 3116 3117 // valid follow sets are ',' ':' and '}' 3118 SKIP_WS(*scan); 3119 if (**scan == 
'}' || **scan == ',') { 3120 for (i = 0; i < count; i++) { 3121 if ((start > maxOsId) || 3122 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3123 if (__kmp_affinity_verbose || 3124 (__kmp_affinity_warnings && 3125 (__kmp_affinity_type != affinity_none))) { 3126 KMP_WARNING(AffIgnoreInvalidProcID, start); 3127 } 3128 break; // don't proliferate warnings for large count 3129 } else { 3130 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3131 start++; 3132 (*setSize)++; 3133 } 3134 } 3135 if (**scan == '}') { 3136 break; 3137 } 3138 (*scan)++; // skip ',' 3139 continue; 3140 } 3141 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3142 (*scan)++; // skip ':' 3143 3144 // Read stride parameter 3145 int sign = +1; 3146 for (;;) { 3147 SKIP_WS(*scan); 3148 if (**scan == '+') { 3149 (*scan)++; // skip '+' 3150 continue; 3151 } 3152 if (**scan == '-') { 3153 sign *= -1; 3154 (*scan)++; // skip '-' 3155 continue; 3156 } 3157 break; 3158 } 3159 SKIP_WS(*scan); 3160 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list"); 3161 next = *scan; 3162 SKIP_DIGITS(next); 3163 stride = __kmp_str_to_int(*scan, *next); 3164 KMP_ASSERT(stride >= 0); 3165 *scan = next; 3166 stride *= sign; 3167 3168 // valid follow sets are ',' and '}' 3169 SKIP_WS(*scan); 3170 if (**scan == '}' || **scan == ',') { 3171 for (i = 0; i < count; i++) { 3172 if ((start > maxOsId) || 3173 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3174 if (__kmp_affinity_verbose || 3175 (__kmp_affinity_warnings && 3176 (__kmp_affinity_type != affinity_none))) { 3177 KMP_WARNING(AffIgnoreInvalidProcID, start); 3178 } 3179 break; // don't proliferate warnings for large count 3180 } else { 3181 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3182 start += stride; 3183 (*setSize)++; 3184 } 3185 } 3186 if (**scan == '}') { 3187 break; 3188 } 3189 (*scan)++; // skip ',' 3190 continue; 3191 } 3192 3193 KMP_ASSERT2(0, "bad explicit places list"); 3194 } 3195 } 3196 3197 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3198 int maxOsId, kmp_affin_mask_t *tempMask, 3199 int *setSize) { 3200 const char *next; 3201 3202 // valid follow sets are '{' '!' and num 3203 SKIP_WS(*scan); 3204 if (**scan == '{') { 3205 (*scan)++; // skip '{' 3206 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize); 3207 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3208 (*scan)++; // skip '}' 3209 } else if (**scan == '!') { 3210 (*scan)++; // skip '!' 
3211 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3212 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3213 } else if ((**scan >= '0') && (**scan <= '9')) { 3214 next = *scan; 3215 SKIP_DIGITS(next); 3216 int num = __kmp_str_to_int(*scan, *next); 3217 KMP_ASSERT(num >= 0); 3218 if ((num > maxOsId) || 3219 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3220 if (__kmp_affinity_verbose || 3221 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3222 KMP_WARNING(AffIgnoreInvalidProcID, num); 3223 } 3224 } else { 3225 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3226 (*setSize)++; 3227 } 3228 *scan = next; // skip num 3229 } else { 3230 KMP_ASSERT2(0, "bad explicit places list"); 3231 } 3232 } 3233 3234 // static void 3235 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3236 unsigned int *out_numMasks, 3237 const char *placelist, 3238 kmp_affin_mask_t *osId2Mask, 3239 int maxOsId) { 3240 int i, j, count, stride, sign; 3241 const char *scan = placelist; 3242 const char *next = placelist; 3243 3244 numNewMasks = 2; 3245 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3246 nextNewMask = 0; 3247 3248 // tempMask is modified based on the previous or initial 3249 // place to form the current place 3250 // previousMask contains the previous place 3251 kmp_affin_mask_t *tempMask; 3252 kmp_affin_mask_t *previousMask; 3253 KMP_CPU_ALLOC(tempMask); 3254 KMP_CPU_ZERO(tempMask); 3255 KMP_CPU_ALLOC(previousMask); 3256 KMP_CPU_ZERO(previousMask); 3257 int setSize = 0; 3258 3259 for (;;) { 3260 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3261 3262 // valid follow sets are ',' ':' and EOL 3263 SKIP_WS(scan); 3264 if (*scan == '\0' || *scan == ',') { 3265 if (setSize > 0) { 3266 ADD_MASK(tempMask); 3267 } 3268 KMP_CPU_ZERO(tempMask); 3269 setSize = 0; 3270 if (*scan == '\0') { 3271 break; 3272 } 3273 scan++; // skip ',' 3274 continue; 3275 } 3276 3277 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3278 scan++; // skip ':' 3279 3280 // Read count parameter 3281 SKIP_WS(scan); 3282 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3283 next = scan; 3284 SKIP_DIGITS(next); 3285 count = __kmp_str_to_int(scan, *next); 3286 KMP_ASSERT(count >= 0); 3287 scan = next; 3288 3289 // valid follow sets are ',' ':' and EOL 3290 SKIP_WS(scan); 3291 if (*scan == '\0' || *scan == ',') { 3292 stride = +1; 3293 } else { 3294 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3295 scan++; // skip ':' 3296 3297 // Read stride parameter 3298 sign = +1; 3299 for (;;) { 3300 SKIP_WS(scan); 3301 if (*scan == '+') { 3302 scan++; // skip '+' 3303 continue; 3304 } 3305 if (*scan == '-') { 3306 sign *= -1; 3307 scan++; // skip '-' 3308 continue; 3309 } 3310 break; 3311 } 3312 SKIP_WS(scan); 3313 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list"); 3314 next = scan; 3315 SKIP_DIGITS(next); 3316 stride = __kmp_str_to_int(scan, *next); 3317 KMP_DEBUG_ASSERT(stride >= 0); 3318 scan = next; 3319 stride *= sign; 3320 } 3321 3322 // Add places determined by initial_place : count : stride 3323 for (i = 0; i < count; i++) { 3324 if (setSize == 0) { 3325 break; 3326 } 3327 // Add the current place, then build the next place (tempMask) from that 3328 KMP_CPU_COPY(previousMask, tempMask); 3329 ADD_MASK(previousMask); 3330 KMP_CPU_ZERO(tempMask); 3331 setSize = 0; 3332 KMP_CPU_SET_ITERATE(j, previousMask) { 3333 if (!KMP_CPU_ISSET(j, previousMask)) { 3334 continue; 3335 } 3336 if ((j + stride > 
maxOsId) || (j + stride < 0) || 3337 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) || 3338 (!KMP_CPU_ISSET(j + stride, 3339 KMP_CPU_INDEX(osId2Mask, j + stride)))) { 3340 if ((__kmp_affinity_verbose || 3341 (__kmp_affinity_warnings && 3342 (__kmp_affinity_type != affinity_none))) && 3343 i < count - 1) { 3344 KMP_WARNING(AffIgnoreInvalidProcID, j + stride); 3345 } 3346 continue; 3347 } 3348 KMP_CPU_SET(j + stride, tempMask); 3349 setSize++; 3350 } 3351 } 3352 KMP_CPU_ZERO(tempMask); 3353 setSize = 0; 3354 3355 // valid follow sets are ',' and EOL 3356 SKIP_WS(scan); 3357 if (*scan == '\0') { 3358 break; 3359 } 3360 if (*scan == ',') { 3361 scan++; // skip ',' 3362 continue; 3363 } 3364 3365 KMP_ASSERT2(0, "bad explicit places list"); 3366 } 3367 3368 *out_numMasks = nextNewMask; 3369 if (nextNewMask == 0) { 3370 *out_masks = NULL; 3371 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3372 return; 3373 } 3374 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3375 KMP_CPU_FREE(tempMask); 3376 KMP_CPU_FREE(previousMask); 3377 for (i = 0; i < nextNewMask; i++) { 3378 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); 3379 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i); 3380 KMP_CPU_COPY(dest, src); 3381 } 3382 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3383 } 3384 3385 #undef ADD_MASK 3386 #undef ADD_MASK_OSID 3387 3388 // This function figures out the deepest level at which there is at least one 3389 // cluster/core with more than one processing unit bound to it. 3390 static int __kmp_affinity_find_core_level(int nprocs, int bottom_level) { 3391 int core_level = 0; 3392 3393 for (int i = 0; i < nprocs; i++) { 3394 const kmp_hw_thread_t &hw_thread = __kmp_topology->at(i); 3395 for (int j = bottom_level; j > 0; j--) { 3396 if (hw_thread.ids[j] > 0) { 3397 if (core_level < (j - 1)) { 3398 core_level = j - 1; 3399 } 3400 } 3401 } 3402 } 3403 return core_level; 3404 } 3405 3406 // This function counts number of clusters/cores at given level. 3407 static int __kmp_affinity_compute_ncores(int nprocs, int bottom_level, 3408 int core_level) { 3409 return __kmp_topology->get_count(core_level); 3410 } 3411 // This function finds to which cluster/core given processing unit is bound. 3412 static int __kmp_affinity_find_core(int proc, int bottom_level, 3413 int core_level) { 3414 int core = 0; 3415 KMP_DEBUG_ASSERT(proc >= 0 && proc < __kmp_topology->get_num_hw_threads()); 3416 for (int i = 0; i <= proc; ++i) { 3417 if (i + 1 <= proc) { 3418 for (int j = 0; j <= core_level; ++j) { 3419 if (__kmp_topology->at(i + 1).sub_ids[j] != 3420 __kmp_topology->at(i).sub_ids[j]) { 3421 core++; 3422 break; 3423 } 3424 } 3425 } 3426 } 3427 return core; 3428 } 3429 3430 // This function finds maximal number of processing units bound to a 3431 // cluster/core at given level. 
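// (For instance, on a hypothetical machine whose bottom level is 2-way SMT and
// with core_level at the core layer, this returns 2; when core_level >=
// bottom_level it simply returns 1.)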
3432 static int __kmp_affinity_max_proc_per_core(int nprocs, int bottom_level, 3433 int core_level) { 3434 if (core_level >= bottom_level) 3435 return 1; 3436 int thread_level = __kmp_topology->get_level(KMP_HW_THREAD); 3437 return __kmp_topology->calculate_ratio(thread_level, core_level); 3438 } 3439 3440 static int *procarr = NULL; 3441 static int __kmp_aff_depth = 0; 3442 3443 // Create a one element mask array (set of places) which only contains the 3444 // initial process's affinity mask 3445 static void __kmp_create_affinity_none_places() { 3446 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3447 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3448 __kmp_affinity_num_masks = 1; 3449 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3450 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0); 3451 KMP_CPU_COPY(dest, __kmp_affin_fullMask); 3452 } 3453 3454 static void __kmp_aux_affinity_initialize(void) { 3455 if (__kmp_affinity_masks != NULL) { 3456 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3457 return; 3458 } 3459 3460 // Create the "full" mask - this defines all of the processors that we 3461 // consider to be in the machine model. If respect is set, then it is the 3462 // initialization thread's affinity mask. Otherwise, it is all processors that 3463 // we know about on the machine. 3464 if (__kmp_affin_fullMask == NULL) { 3465 KMP_CPU_ALLOC(__kmp_affin_fullMask); 3466 } 3467 if (KMP_AFFINITY_CAPABLE()) { 3468 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE); 3469 if (__kmp_affinity_respect_mask) { 3470 // Count the number of available processors. 3471 unsigned i; 3472 __kmp_avail_proc = 0; 3473 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) { 3474 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) { 3475 continue; 3476 } 3477 __kmp_avail_proc++; 3478 } 3479 if (__kmp_avail_proc > __kmp_xproc) { 3480 if (__kmp_affinity_verbose || 3481 (__kmp_affinity_warnings && 3482 (__kmp_affinity_type != affinity_none))) { 3483 KMP_WARNING(ErrorInitializeAffinity); 3484 } 3485 __kmp_affinity_type = affinity_none; 3486 KMP_AFFINITY_DISABLE(); 3487 return; 3488 } 3489 3490 if (__kmp_affinity_verbose) { 3491 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3492 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3493 __kmp_affin_fullMask); 3494 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 3495 } 3496 } else { 3497 if (__kmp_affinity_verbose) { 3498 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 3499 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 3500 __kmp_affin_fullMask); 3501 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 3502 } 3503 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask); 3504 __kmp_avail_proc = __kmp_xproc; 3505 #if KMP_OS_WINDOWS 3506 // Set the process affinity mask since threads' affinity 3507 // masks must be subset of process mask in Windows* OS 3508 __kmp_affin_fullMask->set_process_affinity(true); 3509 #endif 3510 } 3511 } 3512 3513 kmp_i18n_id_t msg_id = kmp_i18n_null; 3514 3515 // For backward compatibility, setting KMP_CPUINFO_FILE => 3516 // KMP_TOPOLOGY_METHOD=cpuinfo 3517 if ((__kmp_cpuinfo_file != NULL) && 3518 (__kmp_affinity_top_method == affinity_top_method_all)) { 3519 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3520 } 3521 3522 bool success = false; 3523 if (__kmp_affinity_top_method == affinity_top_method_all) { 3524 // In the default code path, errors are not fatal - we just try using 3525 // another method. 
We only emit a warning message if affinity is on, or the 3526 // verbose flag is set, and the nowarnings flag was not set. 3527 #if KMP_USE_HWLOC 3528 if (!success && 3529 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) { 3530 if (!__kmp_hwloc_error) { 3531 success = __kmp_affinity_create_hwloc_map(&msg_id); 3532 if (!success && __kmp_affinity_verbose) { 3533 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3534 } 3535 } else if (__kmp_affinity_verbose) { 3536 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3537 } 3538 } 3539 #endif 3540 3541 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3542 if (!success) { 3543 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3544 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3545 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3546 } 3547 } 3548 if (!success) { 3549 success = __kmp_affinity_create_apicid_map(&msg_id); 3550 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3551 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3552 } 3553 } 3554 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3555 3556 #if KMP_OS_LINUX 3557 if (!success) { 3558 int line = 0; 3559 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3560 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3561 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3562 } 3563 } 3564 #endif /* KMP_OS_LINUX */ 3565 3566 #if KMP_GROUP_AFFINITY 3567 if (!success && (__kmp_num_proc_groups > 1)) { 3568 success = __kmp_affinity_create_proc_group_map(&msg_id); 3569 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3570 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3571 } 3572 } 3573 #endif /* KMP_GROUP_AFFINITY */ 3574 3575 if (!success) { 3576 success = __kmp_affinity_create_flat_map(&msg_id); 3577 if (!success && __kmp_affinity_verbose && msg_id != kmp_i18n_null) { 3578 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id)); 3579 } 3580 KMP_ASSERT(success); 3581 } 3582 } 3583 3584 // If the user has specified that a particular topology discovery method is to be 3585 // used, then we abort if that method fails. The exception is group affinity, 3586 // which might have been implicitly set.
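// For example (illustrative): KMP_TOPOLOGY_METHOD=cpuinfo selects the
// /proc/cpuinfo parser below, and a parse failure is then fatal
// (FileLineMsgExiting / FileMsgExiting) rather than falling through to the
// flat map as the default path above does.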
3587 #if KMP_USE_HWLOC 3588 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3589 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC); 3590 success = __kmp_affinity_create_hwloc_map(&msg_id); 3591 if (!success) { 3592 KMP_ASSERT(msg_id != kmp_i18n_null); 3593 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3594 } 3595 } 3596 #endif // KMP_USE_HWLOC 3597 3598 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 3599 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid || 3600 __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) { 3601 success = __kmp_affinity_create_x2apicid_map(&msg_id); 3602 if (!success) { 3603 KMP_ASSERT(msg_id != kmp_i18n_null); 3604 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3605 } 3606 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3607 success = __kmp_affinity_create_apicid_map(&msg_id); 3608 if (!success) { 3609 KMP_ASSERT(msg_id != kmp_i18n_null); 3610 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3611 } 3612 } 3613 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3614 3615 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3616 int line = 0; 3617 success = __kmp_affinity_create_cpuinfo_map(&line, &msg_id); 3618 if (!success) { 3619 KMP_ASSERT(msg_id != kmp_i18n_null); 3620 const char *filename = __kmp_cpuinfo_get_filename(); 3621 if (line > 0) { 3622 KMP_FATAL(FileLineMsgExiting, filename, line, 3623 __kmp_i18n_catgets(msg_id)); 3624 } else { 3625 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3626 } 3627 } 3628 } 3629 3630 #if KMP_GROUP_AFFINITY 3631 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3632 success = __kmp_affinity_create_proc_group_map(&msg_id); 3633 KMP_ASSERT(success); 3634 if (!success) { 3635 KMP_ASSERT(msg_id != kmp_i18n_null); 3636 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3637 } 3638 } 3639 #endif /* KMP_GROUP_AFFINITY */ 3640 3641 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3642 success = __kmp_affinity_create_flat_map(&msg_id); 3643 // should not fail 3644 KMP_ASSERT(success); 3645 } 3646 3647 // Early exit if topology could not be created 3648 if (!__kmp_topology) { 3649 if (KMP_AFFINITY_CAPABLE() && 3650 (__kmp_affinity_verbose || 3651 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) { 3652 KMP_WARNING(ErrorInitializeAffinity); 3653 } 3654 if (nPackages > 0 && nCoresPerPkg > 0 && __kmp_nThreadsPerCore > 0 && 3655 __kmp_ncores > 0) { 3656 __kmp_topology = kmp_topology_t::allocate(0, 0, NULL); 3657 __kmp_topology->canonicalize(nPackages, nCoresPerPkg, 3658 __kmp_nThreadsPerCore, __kmp_ncores); 3659 if (__kmp_affinity_verbose) { 3660 __kmp_topology->print("KMP_AFFINITY"); 3661 } 3662 } 3663 __kmp_affinity_type = affinity_none; 3664 __kmp_create_affinity_none_places(); 3665 #if KMP_USE_HIER_SCHED 3666 __kmp_dispatch_set_hierarchy_values(); 3667 #endif 3668 KMP_AFFINITY_DISABLE(); 3669 return; 3670 } 3671 3672 // Canonicalize, print (if requested), apply KMP_HW_SUBSET, and 3673 // initialize other data structures which depend on the topology 3674 __kmp_topology->canonicalize(); 3675 if (__kmp_affinity_verbose) 3676 __kmp_topology->print("KMP_AFFINITY"); 3677 bool filtered = __kmp_topology->filter_hw_subset(); 3678 if (filtered && __kmp_affinity_verbose) 3679 __kmp_topology->print("KMP_HW_SUBSET"); 3680 machine_hierarchy.init(__kmp_topology->get_num_hw_threads()); 3681 KMP_ASSERT(__kmp_avail_proc == __kmp_topology->get_num_hw_threads()); 3682 // If 
KMP_AFFINITY=none, then only create the single "none" place 3683 // which is the process's initial affinity mask or the number of 3684 // hardware threads depending on respect,norespect 3685 if (__kmp_affinity_type == affinity_none) { 3686 __kmp_create_affinity_none_places(); 3687 #if KMP_USE_HIER_SCHED 3688 __kmp_dispatch_set_hierarchy_values(); 3689 #endif 3690 return; 3691 } 3692 int depth = __kmp_topology->get_depth(); 3693 3694 // Create the table of masks, indexed by thread Id. 3695 unsigned maxIndex; 3696 unsigned numUnique; 3697 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique); 3698 if (__kmp_affinity_gran_levels == 0) { 3699 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3700 } 3701 3702 switch (__kmp_affinity_type) { 3703 3704 case affinity_explicit: 3705 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3706 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) { 3707 __kmp_affinity_process_proclist( 3708 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3709 __kmp_affinity_proclist, osId2Mask, maxIndex); 3710 } else { 3711 __kmp_affinity_process_placelist( 3712 &__kmp_affinity_masks, &__kmp_affinity_num_masks, 3713 __kmp_affinity_proclist, osId2Mask, maxIndex); 3714 } 3715 if (__kmp_affinity_num_masks == 0) { 3716 if (__kmp_affinity_verbose || 3717 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) { 3718 KMP_WARNING(AffNoValidProcID); 3719 } 3720 __kmp_affinity_type = affinity_none; 3721 __kmp_create_affinity_none_places(); 3722 return; 3723 } 3724 break; 3725 3726 // The other affinity types rely on sorting the hardware threads according to 3727 // some permutation of the machine topology tree. Set __kmp_affinity_compact 3728 // and __kmp_affinity_offset appropriately, then jump to a common code 3729 // fragment to do the sort and create the array of affinity masks. 
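// Rough sketch (assuming a socket/core/thread topology): affinity_compact
// sorts so that consecutive places stay close in the tree (sharing a core,
// then a socket), while affinity_scatter reverses the key below
// (depth - 1 - compact) so that consecutive places are spread across sockets
// first.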
3730 case affinity_logical: 3731 __kmp_affinity_compact = 0; 3732 if (__kmp_affinity_offset) { 3733 __kmp_affinity_offset = 3734 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 3735 } 3736 goto sortTopology; 3737 3738 case affinity_physical: 3739 if (__kmp_nThreadsPerCore > 1) { 3740 __kmp_affinity_compact = 1; 3741 if (__kmp_affinity_compact >= depth) { 3742 __kmp_affinity_compact = 0; 3743 } 3744 } else { 3745 __kmp_affinity_compact = 0; 3746 } 3747 if (__kmp_affinity_offset) { 3748 __kmp_affinity_offset = 3749 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc; 3750 } 3751 goto sortTopology; 3752 3753 case affinity_scatter: 3754 if (__kmp_affinity_compact >= depth) { 3755 __kmp_affinity_compact = 0; 3756 } else { 3757 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3758 } 3759 goto sortTopology; 3760 3761 case affinity_compact: 3762 if (__kmp_affinity_compact >= depth) { 3763 __kmp_affinity_compact = depth - 1; 3764 } 3765 goto sortTopology; 3766 3767 case affinity_balanced: 3768 if (depth <= 1) { 3769 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 3770 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 3771 } 3772 __kmp_affinity_type = affinity_none; 3773 __kmp_create_affinity_none_places(); 3774 return; 3775 } else if (!__kmp_topology->is_uniform()) { 3776 // Save the depth for further usage 3777 __kmp_aff_depth = depth; 3778 3779 int core_level = 3780 __kmp_affinity_find_core_level(__kmp_avail_proc, depth - 1); 3781 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, depth - 1, 3782 core_level); 3783 int maxprocpercore = __kmp_affinity_max_proc_per_core( 3784 __kmp_avail_proc, depth - 1, core_level); 3785 3786 int nproc = ncores * maxprocpercore; 3787 if ((nproc < 2) || (nproc < __kmp_avail_proc)) { 3788 if (__kmp_affinity_verbose || __kmp_affinity_warnings) { 3789 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY"); 3790 } 3791 __kmp_affinity_type = affinity_none; 3792 return; 3793 } 3794 3795 procarr = (int *)__kmp_allocate(sizeof(int) * nproc); 3796 for (int i = 0; i < nproc; i++) { 3797 procarr[i] = -1; 3798 } 3799 3800 int lastcore = -1; 3801 int inlastcore = 0; 3802 for (int i = 0; i < __kmp_avail_proc; i++) { 3803 int proc = __kmp_topology->at(i).os_id; 3804 int core = __kmp_affinity_find_core(i, depth - 1, core_level); 3805 3806 if (core == lastcore) { 3807 inlastcore++; 3808 } else { 3809 inlastcore = 0; 3810 } 3811 lastcore = core; 3812 3813 procarr[core * maxprocpercore + inlastcore] = proc; 3814 } 3815 } 3816 if (__kmp_affinity_compact >= depth) { 3817 __kmp_affinity_compact = depth - 1; 3818 } 3819 3820 sortTopology: 3821 // Allocate the gtid->affinity mask table. 3822 if (__kmp_affinity_dups) { 3823 __kmp_affinity_num_masks = __kmp_avail_proc; 3824 } else { 3825 __kmp_affinity_num_masks = numUnique; 3826 } 3827 3828 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) && 3829 (__kmp_affinity_num_places > 0) && 3830 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) { 3831 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3832 } 3833 3834 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3835 3836 // Sort the topology table according to the current setting of 3837 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 
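// (sort_compact() reorders the hardware threads for the chosen compaction;
// sort_ids() below restores the original id order once the masks have been
// copied out.)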
3838 __kmp_topology->sort_compact(); 3839 { 3840 int i; 3841 unsigned j; 3842 int num_hw_threads = __kmp_topology->get_num_hw_threads(); 3843 for (i = 0, j = 0; i < num_hw_threads; i++) { 3844 if ((!__kmp_affinity_dups) && (!__kmp_topology->at(i).leader)) { 3845 continue; 3846 } 3847 int osId = __kmp_topology->at(i).os_id; 3848 3849 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3850 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3851 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3852 KMP_CPU_COPY(dest, src); 3853 if (++j >= __kmp_affinity_num_masks) { 3854 break; 3855 } 3856 } 3857 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3858 } 3859 // Sort the topology back using ids 3860 __kmp_topology->sort_ids(); 3861 break; 3862 3863 default: 3864 KMP_ASSERT2(0, "Unexpected affinity setting"); 3865 } 3866 3867 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1); 3868 } 3869 3870 void __kmp_affinity_initialize(void) { 3871 // Much of the code above was written assuming that if a machine was not 3872 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3873 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3874 // There are too many checks for __kmp_affinity_type == affinity_none 3875 // in this code. Instead of trying to change them all, check if 3876 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3877 // affinity_none, call the real initialization routine, then restore 3878 // __kmp_affinity_type to affinity_disabled. 3879 int disabled = (__kmp_affinity_type == affinity_disabled); 3880 if (!KMP_AFFINITY_CAPABLE()) { 3881 KMP_ASSERT(disabled); 3882 } 3883 if (disabled) { 3884 __kmp_affinity_type = affinity_none; 3885 } 3886 __kmp_aux_affinity_initialize(); 3887 if (disabled) { 3888 __kmp_affinity_type = affinity_disabled; 3889 } 3890 } 3891 3892 void __kmp_affinity_uninitialize(void) { 3893 if (__kmp_affinity_masks != NULL) { 3894 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks); 3895 __kmp_affinity_masks = NULL; 3896 } 3897 if (__kmp_affin_fullMask != NULL) { 3898 KMP_CPU_FREE(__kmp_affin_fullMask); 3899 __kmp_affin_fullMask = NULL; 3900 } 3901 __kmp_affinity_num_masks = 0; 3902 __kmp_affinity_type = affinity_default; 3903 __kmp_affinity_num_places = 0; 3904 if (__kmp_affinity_proclist != NULL) { 3905 __kmp_free(__kmp_affinity_proclist); 3906 __kmp_affinity_proclist = NULL; 3907 } 3908 if (procarr != NULL) { 3909 __kmp_free(procarr); 3910 procarr = NULL; 3911 } 3912 #if KMP_USE_HWLOC 3913 if (__kmp_hwloc_topology != NULL) { 3914 hwloc_topology_destroy(__kmp_hwloc_topology); 3915 __kmp_hwloc_topology = NULL; 3916 } 3917 #endif 3918 if (__kmp_hw_subset) { 3919 kmp_hw_subset_t::deallocate(__kmp_hw_subset); 3920 __kmp_hw_subset = nullptr; 3921 } 3922 if (__kmp_topology) { 3923 kmp_topology_t::deallocate(__kmp_topology); 3924 __kmp_topology = nullptr; 3925 } 3926 KMPAffinity::destroy_api(); 3927 } 3928 3929 void __kmp_affinity_set_init_mask(int gtid, int isa_root) { 3930 if (!KMP_AFFINITY_CAPABLE()) { 3931 return; 3932 } 3933 3934 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 3935 if (th->th.th_affin_mask == NULL) { 3936 KMP_CPU_ALLOC(th->th.th_affin_mask); 3937 } else { 3938 KMP_CPU_ZERO(th->th.th_affin_mask); 3939 } 3940 3941 // Copy the thread mask to the kmp_info_t structure. If 3942 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. 
one that 3943 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set, 3944 // then the full mask is the same as the mask of the initialization thread. 3945 kmp_affin_mask_t *mask; 3946 int i; 3947 3948 if (KMP_AFFINITY_NON_PROC_BIND) { 3949 if ((__kmp_affinity_type == affinity_none) || 3950 (__kmp_affinity_type == affinity_balanced) || 3951 KMP_HIDDEN_HELPER_THREAD(gtid)) { 3952 #if KMP_GROUP_AFFINITY 3953 if (__kmp_num_proc_groups > 1) { 3954 return; 3955 } 3956 #endif 3957 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3958 i = 0; 3959 mask = __kmp_affin_fullMask; 3960 } else { 3961 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 3962 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 3963 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 3964 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 3965 } 3966 } else { 3967 if ((!isa_root) || KMP_HIDDEN_HELPER_THREAD(gtid) || 3968 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) { 3969 #if KMP_GROUP_AFFINITY 3970 if (__kmp_num_proc_groups > 1) { 3971 return; 3972 } 3973 #endif 3974 KMP_ASSERT(__kmp_affin_fullMask != NULL); 3975 i = KMP_PLACE_ALL; 3976 mask = __kmp_affin_fullMask; 3977 } else { 3978 // int i = some hash function or just a counter that doesn't 3979 // always start at 0. Use adjusted gtid for now. 3980 int mask_idx = __kmp_adjust_gtid_for_hidden_helpers(gtid); 3981 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0); 3982 i = (mask_idx + __kmp_affinity_offset) % __kmp_affinity_num_masks; 3983 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i); 3984 } 3985 } 3986 3987 th->th.th_current_place = i; 3988 if (isa_root || KMP_HIDDEN_HELPER_THREAD(gtid)) { 3989 th->th.th_new_place = i; 3990 th->th.th_first_place = 0; 3991 th->th.th_last_place = __kmp_affinity_num_masks - 1; 3992 } else if (KMP_AFFINITY_NON_PROC_BIND) { 3993 // When using a Non-OMP_PROC_BIND affinity method, 3994 // set all threads' place-partition-var to the entire place list 3995 th->th.th_first_place = 0; 3996 th->th.th_last_place = __kmp_affinity_num_masks - 1; 3997 } 3998 3999 if (i == KMP_PLACE_ALL) { 4000 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n", 4001 gtid)); 4002 } else { 4003 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n", 4004 gtid, i)); 4005 } 4006 4007 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4008 4009 if (__kmp_affinity_verbose && !KMP_HIDDEN_HELPER_THREAD(gtid) 4010 /* to avoid duplicate printing (will be correctly printed on barrier) */ 4011 && (__kmp_affinity_type == affinity_none || 4012 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) { 4013 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4014 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4015 th->th.th_affin_mask); 4016 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4017 __kmp_gettid(), gtid, buf); 4018 } 4019 4020 #if KMP_DEBUG 4021 // Hidden helper thread affinity only printed for debug builds 4022 if (__kmp_affinity_verbose && KMP_HIDDEN_HELPER_THREAD(gtid)) { 4023 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4024 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4025 th->th.th_affin_mask); 4026 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY (hidden helper thread)", 4027 (kmp_int32)getpid(), __kmp_gettid(), gtid, buf); 4028 } 4029 #endif 4030 4031 #if KMP_OS_WINDOWS 4032 // On Windows* OS, the process affinity mask might have changed. If the user 4033 // didn't request affinity and this call fails, just continue silently. 4034 // See CQ171393. 
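// (Hence the affinity_none branch below passes FALSE and tolerates a failed
// call, while the normal path passes TRUE and treats failure as an error.)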
4035 if (__kmp_affinity_type == affinity_none) { 4036 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4037 } else 4038 #endif 4039 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4040 } 4041 4042 void __kmp_affinity_set_place(int gtid) { 4043 if (!KMP_AFFINITY_CAPABLE()) { 4044 return; 4045 } 4046 4047 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4048 4049 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current " 4050 "place = %d)\n", 4051 gtid, th->th.th_new_place, th->th.th_current_place)); 4052 4053 // Check that the new place is within this thread's partition. 4054 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4055 KMP_ASSERT(th->th.th_new_place >= 0); 4056 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4057 if (th->th.th_first_place <= th->th.th_last_place) { 4058 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) && 4059 (th->th.th_new_place <= th->th.th_last_place)); 4060 } else { 4061 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) || 4062 (th->th.th_new_place >= th->th.th_last_place)); 4063 } 4064 4065 // Copy the thread mask to the kmp_info_t structure, 4066 // and set this thread's affinity. 4067 kmp_affin_mask_t *mask = 4068 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place); 4069 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4070 th->th.th_current_place = th->th.th_new_place; 4071 4072 if (__kmp_affinity_verbose) { 4073 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4074 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4075 th->th.th_affin_mask); 4076 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4077 __kmp_gettid(), gtid, buf); 4078 } 4079 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4080 } 4081 4082 int __kmp_aux_set_affinity(void **mask) { 4083 int gtid; 4084 kmp_info_t *th; 4085 int retval; 4086 4087 if (!KMP_AFFINITY_CAPABLE()) { 4088 return -1; 4089 } 4090 4091 gtid = __kmp_entry_gtid(); 4092 KA_TRACE( 4093 1000, (""); { 4094 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4095 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4096 (kmp_affin_mask_t *)(*mask)); 4097 __kmp_debug_printf( 4098 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4099 gtid, buf); 4100 }); 4101 4102 if (__kmp_env_consistency_check) { 4103 if ((mask == NULL) || (*mask == NULL)) { 4104 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4105 } else { 4106 unsigned proc; 4107 int num_procs = 0; 4108 4109 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) { 4110 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4111 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4112 } 4113 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4114 continue; 4115 } 4116 num_procs++; 4117 } 4118 if (num_procs == 0) { 4119 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4120 } 4121 4122 #if KMP_GROUP_AFFINITY 4123 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) { 4124 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4125 } 4126 #endif /* KMP_GROUP_AFFINITY */ 4127 } 4128 } 4129 4130 th = __kmp_threads[gtid]; 4131 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4132 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4133 if (retval == 0) { 4134 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask)); 4135 } 4136 4137 th->th.th_current_place = KMP_PLACE_UNDEFINED; 4138 th->th.th_new_place = KMP_PLACE_UNDEFINED; 4139 th->th.th_first_place = 0; 4140 th->th.th_last_place = __kmp_affinity_num_masks - 1; 4141 4142 // Turn off 4.0 
affinity for the current thread at this parallel level. 4143 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false; 4144 4145 return retval; 4146 } 4147 4148 int __kmp_aux_get_affinity(void **mask) { 4149 int gtid; 4150 int retval; 4151 #if KMP_OS_WINDOWS || KMP_DEBUG 4152 kmp_info_t *th; 4153 #endif 4154 if (!KMP_AFFINITY_CAPABLE()) { 4155 return -1; 4156 } 4157 4158 gtid = __kmp_entry_gtid(); 4159 #if KMP_OS_WINDOWS || KMP_DEBUG 4160 th = __kmp_threads[gtid]; 4161 #else 4162 (void)gtid; // unused variable 4163 #endif 4164 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4165 4166 KA_TRACE( 4167 1000, (""); { 4168 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4169 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4170 th->th.th_affin_mask); 4171 __kmp_printf( 4172 "kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, 4173 buf); 4174 }); 4175 4176 if (__kmp_env_consistency_check) { 4177 if ((mask == NULL) || (*mask == NULL)) { 4178 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity"); 4179 } 4180 } 4181 4182 #if !KMP_OS_WINDOWS 4183 4184 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE); 4185 KA_TRACE( 4186 1000, (""); { 4187 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4188 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4189 (kmp_affin_mask_t *)(*mask)); 4190 __kmp_printf( 4191 "kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, 4192 buf); 4193 }); 4194 return retval; 4195 4196 #else 4197 (void)retval; 4198 4199 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask); 4200 return 0; 4201 4202 #endif /* KMP_OS_WINDOWS */ 4203 } 4204 4205 int __kmp_aux_get_affinity_max_proc() { 4206 if (!KMP_AFFINITY_CAPABLE()) { 4207 return 0; 4208 } 4209 #if KMP_GROUP_AFFINITY 4210 if (__kmp_num_proc_groups > 1) { 4211 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT); 4212 } 4213 #endif 4214 return __kmp_xproc; 4215 } 4216 4217 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) { 4218 if (!KMP_AFFINITY_CAPABLE()) { 4219 return -1; 4220 } 4221 4222 KA_TRACE( 4223 1000, (""); { 4224 int gtid = __kmp_entry_gtid(); 4225 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4226 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4227 (kmp_affin_mask_t *)(*mask)); 4228 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in " 4229 "affinity mask for thread %d = %s\n", 4230 proc, gtid, buf); 4231 }); 4232 4233 if (__kmp_env_consistency_check) { 4234 if ((mask == NULL) || (*mask == NULL)) { 4235 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc"); 4236 } 4237 } 4238 4239 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4240 return -1; 4241 } 4242 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4243 return -2; 4244 } 4245 4246 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask)); 4247 return 0; 4248 } 4249 4250 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) { 4251 if (!KMP_AFFINITY_CAPABLE()) { 4252 return -1; 4253 } 4254 4255 KA_TRACE( 4256 1000, (""); { 4257 int gtid = __kmp_entry_gtid(); 4258 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4259 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4260 (kmp_affin_mask_t *)(*mask)); 4261 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in " 4262 "affinity mask for thread %d = %s\n", 4263 proc, gtid, buf); 4264 }); 4265 4266 if (__kmp_env_consistency_check) { 4267 if ((mask == NULL) || (*mask == NULL)) { 4268 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc"); 4269 } 4270 } 4271 4272 if ((proc < 0) || (proc >=
__kmp_aux_get_affinity_max_proc())) { 4273 return -1; 4274 } 4275 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4276 return -2; 4277 } 4278 4279 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask)); 4280 return 0; 4281 } 4282 4283 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) { 4284 if (!KMP_AFFINITY_CAPABLE()) { 4285 return -1; 4286 } 4287 4288 KA_TRACE( 4289 1000, (""); { 4290 int gtid = __kmp_entry_gtid(); 4291 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4292 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4293 (kmp_affin_mask_t *)(*mask)); 4294 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in " 4295 "affinity mask for thread %d = %s\n", 4296 proc, gtid, buf); 4297 }); 4298 4299 if (__kmp_env_consistency_check) { 4300 if ((mask == NULL) || (*mask == NULL)) { 4301 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc"); 4302 } 4303 } 4304 4305 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) { 4306 return -1; 4307 } 4308 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) { 4309 return 0; 4310 } 4311 4312 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask)); 4313 } 4314 4315 // Dynamic affinity settings - Affinity balanced 4316 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) { 4317 KMP_DEBUG_ASSERT(th); 4318 bool fine_gran = true; 4319 int tid = th->th.th_info.ds.ds_tid; 4320 4321 // Do not perform balanced affinity for the hidden helper threads 4322 if (KMP_HIDDEN_HELPER_THREAD(__kmp_gtid_from_thread(th))) 4323 return; 4324 4325 switch (__kmp_affinity_gran) { 4326 case KMP_HW_THREAD: 4327 break; 4328 case KMP_HW_CORE: 4329 if (__kmp_nThreadsPerCore > 1) { 4330 fine_gran = false; 4331 } 4332 break; 4333 case KMP_HW_SOCKET: 4334 if (nCoresPerPkg > 1) { 4335 fine_gran = false; 4336 } 4337 break; 4338 default: 4339 fine_gran = false; 4340 } 4341 4342 if (__kmp_topology->is_uniform()) { 4343 int coreID; 4344 int threadID; 4345 // Number of hyper threads per core in HT machine 4346 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores; 4347 // Number of cores 4348 int ncores = __kmp_ncores; 4349 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) { 4350 __kmp_nth_per_core = __kmp_avail_proc / nPackages; 4351 ncores = nPackages; 4352 } 4353 // How many threads will be bound to each core 4354 int chunk = nthreads / ncores; 4355 // How many cores will have an additional thread bound to it - "big cores" 4356 int big_cores = nthreads % ncores; 4357 // Number of threads on the big cores 4358 int big_nth = (chunk + 1) * big_cores; 4359 if (tid < big_nth) { 4360 coreID = tid / (chunk + 1); 4361 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core; 4362 } else { // tid >= big_nth 4363 coreID = (tid - big_cores) / chunk; 4364 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core; 4365 } 4366 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(), 4367 "Illegal set affinity operation when not capable"); 4368 4369 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4370 KMP_CPU_ZERO(mask); 4371 4372 if (fine_gran) { 4373 int osID = 4374 __kmp_topology->at(coreID * __kmp_nth_per_core + threadID).os_id; 4375 KMP_CPU_SET(osID, mask); 4376 } else { 4377 for (int i = 0; i < __kmp_nth_per_core; i++) { 4378 int osID; 4379 osID = __kmp_topology->at(coreID * __kmp_nth_per_core + i).os_id; 4380 KMP_CPU_SET(osID, mask); 4381 } 4382 } 4383 if (__kmp_affinity_verbose) { 4384 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4385 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4386 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4387 __kmp_gettid(), 
tid, buf); 4388 } 4389 __kmp_set_system_affinity(mask, TRUE); 4390 } else { // Non-uniform topology 4391 4392 kmp_affin_mask_t *mask = th->th.th_affin_mask; 4393 KMP_CPU_ZERO(mask); 4394 4395 int core_level = 4396 __kmp_affinity_find_core_level(__kmp_avail_proc, __kmp_aff_depth - 1); 4397 int ncores = __kmp_affinity_compute_ncores(__kmp_avail_proc, 4398 __kmp_aff_depth - 1, core_level); 4399 int nth_per_core = __kmp_affinity_max_proc_per_core( 4400 __kmp_avail_proc, __kmp_aff_depth - 1, core_level); 4401 4402 // For performance gain consider the special case nthreads == 4403 // __kmp_avail_proc 4404 if (nthreads == __kmp_avail_proc) { 4405 if (fine_gran) { 4406 int osID = __kmp_topology->at(tid).os_id; 4407 KMP_CPU_SET(osID, mask); 4408 } else { 4409 int core = 4410 __kmp_affinity_find_core(tid, __kmp_aff_depth - 1, core_level); 4411 for (int i = 0; i < __kmp_avail_proc; i++) { 4412 int osID = __kmp_topology->at(i).os_id; 4413 if (__kmp_affinity_find_core(i, __kmp_aff_depth - 1, core_level) == 4414 core) { 4415 KMP_CPU_SET(osID, mask); 4416 } 4417 } 4418 } 4419 } else if (nthreads <= ncores) { 4420 4421 int core = 0; 4422 for (int i = 0; i < ncores; i++) { 4423 // Check if this core from procarr[] is in the mask 4424 int in_mask = 0; 4425 for (int j = 0; j < nth_per_core; j++) { 4426 if (procarr[i * nth_per_core + j] != -1) { 4427 in_mask = 1; 4428 break; 4429 } 4430 } 4431 if (in_mask) { 4432 if (tid == core) { 4433 for (int j = 0; j < nth_per_core; j++) { 4434 int osID = procarr[i * nth_per_core + j]; 4435 if (osID != -1) { 4436 KMP_CPU_SET(osID, mask); 4437 // For fine granularity it is enough to set the first available 4438 // osID for this core 4439 if (fine_gran) { 4440 break; 4441 } 4442 } 4443 } 4444 break; 4445 } else { 4446 core++; 4447 } 4448 } 4449 } 4450 } else { // nthreads > ncores 4451 // Array to save the number of processors at each core 4452 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores); 4453 // Array to save the number of cores with "x" available processors; 4454 int *ncores_with_x_procs = 4455 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4456 // Array to save the number of cores with # procs from x to nth_per_core 4457 int *ncores_with_x_to_max_procs = 4458 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1)); 4459 4460 for (int i = 0; i <= nth_per_core; i++) { 4461 ncores_with_x_procs[i] = 0; 4462 ncores_with_x_to_max_procs[i] = 0; 4463 } 4464 4465 for (int i = 0; i < ncores; i++) { 4466 int cnt = 0; 4467 for (int j = 0; j < nth_per_core; j++) { 4468 if (procarr[i * nth_per_core + j] != -1) { 4469 cnt++; 4470 } 4471 } 4472 nproc_at_core[i] = cnt; 4473 ncores_with_x_procs[cnt]++; 4474 } 4475 4476 for (int i = 0; i <= nth_per_core; i++) { 4477 for (int j = i; j <= nth_per_core; j++) { 4478 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j]; 4479 } 4480 } 4481 4482 // Max number of processors 4483 int nproc = nth_per_core * ncores; 4484 // An array to keep number of threads per each context 4485 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc); 4486 for (int i = 0; i < nproc; i++) { 4487 newarr[i] = 0; 4488 } 4489 4490 int nth = nthreads; 4491 int flag = 0; 4492 while (nth > 0) { 4493 for (int j = 1; j <= nth_per_core; j++) { 4494 int cnt = ncores_with_x_to_max_procs[j]; 4495 for (int i = 0; i < ncores; i++) { 4496 // Skip the core with 0 processors 4497 if (nproc_at_core[i] == 0) { 4498 continue; 4499 } 4500 for (int k = 0; k < nth_per_core; k++) { 4501 if (procarr[i * nth_per_core + k] != -1) { 4502 if (newarr[i * nth_per_core + k] == 
0) { 4503 newarr[i * nth_per_core + k] = 1; 4504 cnt--; 4505 nth--; 4506 break; 4507 } else { 4508 if (flag != 0) { 4509 newarr[i * nth_per_core + k]++; 4510 cnt--; 4511 nth--; 4512 break; 4513 } 4514 } 4515 } 4516 } 4517 if (cnt == 0 || nth == 0) { 4518 break; 4519 } 4520 } 4521 if (nth == 0) { 4522 break; 4523 } 4524 } 4525 flag = 1; 4526 } 4527 int sum = 0; 4528 for (int i = 0; i < nproc; i++) { 4529 sum += newarr[i]; 4530 if (sum > tid) { 4531 if (fine_gran) { 4532 int osID = procarr[i]; 4533 KMP_CPU_SET(osID, mask); 4534 } else { 4535 int coreID = i / nth_per_core; 4536 for (int ii = 0; ii < nth_per_core; ii++) { 4537 int osID = procarr[coreID * nth_per_core + ii]; 4538 if (osID != -1) { 4539 KMP_CPU_SET(osID, mask); 4540 } 4541 } 4542 } 4543 break; 4544 } 4545 } 4546 __kmp_free(newarr); 4547 } 4548 4549 if (__kmp_affinity_verbose) { 4550 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4551 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask); 4552 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), 4553 __kmp_gettid(), tid, buf); 4554 } 4555 __kmp_set_system_affinity(mask, TRUE); 4556 } 4557 } 4558 4559 #if KMP_OS_LINUX || KMP_OS_FREEBSD 4560 // We don't need this entry for Windows because 4561 // there is GetProcessAffinityMask() api 4562 // 4563 // The intended usage is indicated by these steps: 4564 // 1) The user gets the current affinity mask 4565 // 2) Then sets the affinity by calling this function 4566 // 3) Error check the return value 4567 // 4) Use non-OpenMP parallelization 4568 // 5) Reset the affinity to what was stored in step 1) 4569 #ifdef __cplusplus 4570 extern "C" 4571 #endif 4572 int 4573 kmp_set_thread_affinity_mask_initial() 4574 // the function returns 0 on success, 4575 // -1 if we cannot bind thread 4576 // >0 (errno) if an error happened during binding 4577 { 4578 int gtid = __kmp_get_gtid(); 4579 if (gtid < 0) { 4580 // Do not touch non-omp threads 4581 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4582 "non-omp thread, returning\n")); 4583 return -1; 4584 } 4585 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) { 4586 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4587 "affinity not initialized, returning\n")); 4588 return -1; 4589 } 4590 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: " 4591 "set full mask for thread %d\n", 4592 gtid)); 4593 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL); 4594 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE); 4595 } 4596 #endif 4597 4598 #endif // KMP_AFFINITY_SUPPORTED 4599
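// A minimal caller-side sketch of the usage steps documented above for
// kmp_set_thread_affinity_mask_initial(). Illustrative only: the kmp_*
// affinity-mask calls are the public KMP affinity API, and
// run_non_openmp_parallel_work() is a hypothetical placeholder.
//
//   kmp_affinity_mask_t saved;
//   kmp_create_affinity_mask(&saved);
//   kmp_get_affinity(&saved);                        // 1) save the current mask
//   if (kmp_set_thread_affinity_mask_initial() == 0)  // 2)-3) widen to the full mask
//     run_non_openmp_parallel_work();                 // 4) non-OpenMP parallelism
//   kmp_set_affinity(&saved);                         // 5) restore the saved mask
//   kmp_destroy_affinity_mask(&saved);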