1 /* 2 Copyright (c) 2019-2023 Intel Corporation 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 #include <vector> 18 #include <mutex> 19 20 #include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here. 21 #include "oneapi/tbb/detail/_assert.h" 22 #include "oneapi/tbb/detail/_config.h" 23 24 #if _MSC_VER && !__INTEL_COMPILER && !__clang__ 25 #pragma warning( push ) 26 #pragma warning( disable : 4100 ) 27 #elif _MSC_VER && __clang__ 28 #pragma GCC diagnostic push 29 #pragma GCC diagnostic ignored "-Wunused-parameter" 30 #endif 31 #include <hwloc.h> 32 #if _MSC_VER && !__INTEL_COMPILER && !__clang__ 33 #pragma warning( pop ) 34 #elif _MSC_VER && __clang__ 35 #pragma GCC diagnostic pop 36 #endif 37 38 #define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400) 39 #define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500) 40 41 // Most of hwloc calls returns negative exit code on error. 42 // This macro tracks error codes that are returned from the hwloc interfaces. 43 #define assertion_hwloc_wrapper(command, ...) \ 44 __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API."); 45 46 namespace tbb { 47 namespace detail { 48 namespace r1 { 49 50 //------------------------------------------------------------------------ 51 // Information about the machine's hardware TBB is happen to work on 52 //------------------------------------------------------------------------ 53 class system_topology { 54 friend class binding_handler; 55 56 // Common topology members 57 hwloc_topology_t topology{nullptr}; 58 hwloc_cpuset_t process_cpu_affinity_mask{nullptr}; 59 hwloc_nodeset_t process_node_affinity_mask{nullptr}; 60 std::size_t number_of_processors_groups{1}; 61 62 // NUMA API related topology members 63 std::vector<hwloc_cpuset_t> numa_affinity_masks_list{}; 64 std::vector<int> numa_indexes_list{}; 65 int numa_nodes_count{0}; 66 67 // Hybrid CPUs API related topology members 68 std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{}; 69 std::vector<int> core_types_indexes_list{}; 70 71 enum init_stages { uninitialized, 72 started, 73 topology_allocated, 74 topology_loaded, 75 topology_parsed } initialization_state; 76 77 // Binding threads that locate in another Windows Processor groups 78 // is allowed only if machine topology contains several Windows Processors groups 79 // and process affinity mask wasn't limited manually (affinity mask cannot violates 80 // processors group boundaries). 81 bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; } 82 83 private: 84 void topology_initialization(std::size_t groups_num) { 85 initialization_state = started; 86 87 // Parse topology 88 if ( hwloc_topology_init( &topology ) == 0 ) { 89 initialization_state = topology_allocated; 90 #if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT 91 if ( groups_num == 1 && 92 hwloc_topology_set_flags(topology, 93 HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | 94 HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING 95 ) != 0 96 ) { 97 return; 98 } 99 #endif 100 if ( hwloc_topology_load( topology ) == 0 ) { 101 initialization_state = topology_loaded; 102 } 103 } 104 if ( initialization_state != topology_loaded ) 105 return; 106 107 #if __TBB_CPUBIND_PRESENT 108 // Getting process affinity mask 109 if ( intergroup_binding_allowed(groups_num) ) { 110 process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology)); 111 process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology)); 112 } else { 113 process_cpu_affinity_mask = hwloc_bitmap_alloc(); 114 process_node_affinity_mask = hwloc_bitmap_alloc(); 115 116 assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0); 117 hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask); 118 } 119 #else 120 process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology)); 121 process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology)); 122 #endif 123 124 number_of_processors_groups = groups_num; 125 } 126 127 void numa_topology_parsing() { 128 // Fill parameters with stubs if topology parsing is broken. 129 if ( initialization_state != topology_loaded ) { 130 numa_nodes_count = 1; 131 numa_indexes_list.push_back(-1); 132 return; 133 } 134 135 // If system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap. 136 // hwloc_bitmap_weight() returns negative value for such bitmaps, so we use this check 137 // to change way of topology initialization. 138 numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask); 139 if (numa_nodes_count <= 0) { 140 // numa_nodes_count may be empty if the process affinity mask is empty too (invalid case) 141 // or if some internal HWLOC error occurred. 142 // So we place -1 as index in this case. 143 numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0); 144 numa_nodes_count = 1; 145 146 numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask)); 147 } else { 148 // Get NUMA logical indexes list 149 unsigned counter = 0; 150 int i = 0; 151 int max_numa_index = -1; 152 numa_indexes_list.resize(numa_nodes_count); 153 hwloc_obj_t node_buffer; 154 hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) { 155 node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i); 156 numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index); 157 158 if ( numa_indexes_list[counter] > max_numa_index ) { 159 max_numa_index = numa_indexes_list[counter]; 160 } 161 162 counter++; 163 } hwloc_bitmap_foreach_end(); 164 __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative"); 165 166 // Fill concurrency and affinity masks lists 167 numa_affinity_masks_list.resize(max_numa_index + 1); 168 int index = 0; 169 hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) { 170 node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i); 171 index = static_cast<int>(node_buffer->logical_index); 172 173 hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index]; 174 current_mask = hwloc_bitmap_dup(node_buffer->cpuset); 175 176 hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask); 177 __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node"); 178 } hwloc_bitmap_foreach_end(); 179 } 180 } 181 182 void core_types_topology_parsing() { 183 // Fill parameters with stubs if topology parsing is broken. 184 if ( initialization_state != topology_loaded ) { 185 core_types_indexes_list.push_back(-1); 186 return; 187 } 188 #if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT 189 __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces required HWLOC >= 2.4"); 190 // Parsing the hybrid CPU topology 191 int core_types_number = hwloc_cpukinds_get_nr(topology, 0); 192 bool core_types_parsing_broken = core_types_number <= 0; 193 if (!core_types_parsing_broken) { 194 core_types_affinity_masks_list.resize(core_types_number); 195 int efficiency{-1}; 196 197 for (int core_type = 0; core_type < core_types_number; ++core_type) { 198 hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type]; 199 current_mask = hwloc_bitmap_alloc(); 200 201 if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0) 202 && efficiency >= 0 203 ) { 204 hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask); 205 206 if (hwloc_bitmap_weight(current_mask) > 0) { 207 core_types_indexes_list.push_back(core_type); 208 } 209 __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinivitely filled core type mask"); 210 } else { 211 core_types_parsing_broken = true; 212 break; 213 } 214 } 215 } 216 #else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/ 217 bool core_types_parsing_broken{true}; 218 #endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/ 219 220 if (core_types_parsing_broken) { 221 for (auto& core_type_mask : core_types_affinity_masks_list) { 222 hwloc_bitmap_free(core_type_mask); 223 } 224 core_types_affinity_masks_list.resize(1); 225 core_types_indexes_list.resize(1); 226 227 core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask); 228 core_types_indexes_list[0] = -1; 229 } 230 } 231 232 void enforce_hwloc_2_5_runtime_linkage() { 233 // Without the call of this function HWLOC 2.4 can be successfully loaded during the tbbbind_2_5 loading. 234 // It is possible since tbbbind_2_5 don't use any new entry points that were introduced in HWLOC 2.5 235 // But tbbbind_2_5 compiles with HWLOC 2.5 header, therefore such situation requires binary forward compatibility 236 // which are not guaranteed by the HWLOC library. To enforce linkage tbbbind_2_5 only with HWLOC >= 2.5 version 237 // this function calls the interface that is available in the HWLOC 2.5 only. 238 #if HWLOC_API_VERSION >= 0x20500 239 auto some_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, nullptr); 240 hwloc_get_obj_with_same_locality(topology, some_core, HWLOC_OBJ_CORE, nullptr, nullptr, 0); 241 #endif 242 } 243 244 245 void initialize( std::size_t groups_num ) { 246 if ( initialization_state != uninitialized ) 247 return; 248 249 topology_initialization(groups_num); 250 numa_topology_parsing(); 251 core_types_topology_parsing(); 252 253 enforce_hwloc_2_5_runtime_linkage(); 254 255 if (initialization_state == topology_loaded) 256 initialization_state = topology_parsed; 257 } 258 259 static system_topology* instance_ptr; 260 public: 261 typedef hwloc_cpuset_t affinity_mask; 262 typedef hwloc_const_cpuset_t const_affinity_mask; 263 264 bool is_topology_parsed() { return initialization_state == topology_parsed; } 265 266 static void construct( std::size_t groups_num ) { 267 if (instance_ptr == nullptr) { 268 instance_ptr = new system_topology(); 269 instance_ptr->initialize(groups_num); 270 } 271 } 272 273 static system_topology& instance() { 274 __TBB_ASSERT(instance_ptr != nullptr, "Getting instance of non-constructed topology"); 275 return *instance_ptr; 276 } 277 278 static void destroy() { 279 __TBB_ASSERT(instance_ptr != nullptr, "Destroying non-constructed topology"); 280 delete instance_ptr; 281 } 282 283 ~system_topology() { 284 if ( is_topology_parsed() ) { 285 for (auto& numa_node_mask : numa_affinity_masks_list) { 286 hwloc_bitmap_free(numa_node_mask); 287 } 288 289 for (auto& core_type_mask : core_types_affinity_masks_list) { 290 hwloc_bitmap_free(core_type_mask); 291 } 292 293 hwloc_bitmap_free(process_node_affinity_mask); 294 hwloc_bitmap_free(process_cpu_affinity_mask); 295 } 296 297 if ( initialization_state >= topology_allocated ) { 298 hwloc_topology_destroy(topology); 299 } 300 301 initialization_state = uninitialized; 302 } 303 304 void fill_topology_information( 305 int& _numa_nodes_count, int*& _numa_indexes_list, 306 int& _core_types_count, int*& _core_types_indexes_list 307 ) { 308 __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); 309 _numa_nodes_count = numa_nodes_count; 310 _numa_indexes_list = numa_indexes_list.data(); 311 312 _core_types_count = (int)core_types_indexes_list.size(); 313 _core_types_indexes_list = core_types_indexes_list.data(); 314 } 315 316 void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) { 317 __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); 318 __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id"); 319 __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id"); 320 __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core"); 321 322 hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc(); 323 hwloc_cpuset_t core_mask = hwloc_bitmap_alloc(); 324 325 hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask); 326 if (numa_node_index >= 0) { 327 hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]); 328 } 329 if (core_type_index >= 0) { 330 hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]); 331 } 332 if (max_threads_per_core > 0) { 333 // clear input mask 334 hwloc_bitmap_zero(input_mask); 335 336 hwloc_obj_t current_core = nullptr; 337 while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) { 338 hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset); 339 340 // fit the core mask to required bits number 341 int current_threads_per_core = 0; 342 for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) { 343 if (++current_threads_per_core > max_threads_per_core) { 344 hwloc_bitmap_clr(core_mask, id); 345 } 346 } 347 348 hwloc_bitmap_or(input_mask, input_mask, core_mask); 349 } 350 } else { 351 hwloc_bitmap_copy(input_mask, constraints_mask); 352 } 353 354 hwloc_bitmap_free(core_mask); 355 hwloc_bitmap_free(constraints_mask); 356 } 357 358 void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) { 359 hwloc_bitmap_zero(result_mask); 360 hwloc_obj_t current_core = nullptr; 361 while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) { 362 if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) { 363 hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset); 364 } 365 } 366 hwloc_bitmap_and(result_mask, result_mask, constraints_mask); 367 } 368 369 int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) { 370 __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); 371 372 hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc(); 373 fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core); 374 375 int default_concurrency = hwloc_bitmap_weight(constraints_mask); 376 hwloc_bitmap_free(constraints_mask); 377 return default_concurrency; 378 } 379 380 affinity_mask allocate_process_affinity_mask() { 381 __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology"); 382 return hwloc_bitmap_dup(process_cpu_affinity_mask); 383 } 384 385 void free_affinity_mask( affinity_mask mask_to_free ) { 386 hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed. 387 } 388 389 void store_current_affinity_mask( affinity_mask current_mask ) { 390 assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD); 391 392 hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask); 393 __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), 394 "Current affinity mask must intersects with process affinity mask"); 395 } 396 397 void set_affinity_mask( const_affinity_mask mask ) { 398 if (hwloc_bitmap_weight(mask) > 0) { 399 assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD); 400 } 401 } 402 }; 403 404 system_topology* system_topology::instance_ptr{nullptr}; 405 406 class binding_handler { 407 // Following vector saves thread affinity mask on scheduler entry to return it to this thread 408 // on scheduler exit. 409 typedef std::vector<system_topology::affinity_mask> affinity_masks_container; 410 affinity_masks_container affinity_backup; 411 system_topology::affinity_mask handler_affinity_mask; 412 413 #ifdef _WIN32 414 affinity_masks_container affinity_buffer; 415 int my_numa_node_id; 416 int my_core_type_id; 417 int my_max_threads_per_core; 418 #endif 419 420 public: 421 binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core ) 422 : affinity_backup(size) 423 #ifdef _WIN32 424 , affinity_buffer(size) 425 , my_numa_node_id(numa_node_id) 426 , my_core_type_id(core_type_id) 427 , my_max_threads_per_core(max_threads_per_core) 428 #endif 429 { 430 for (std::size_t i = 0; i < size; ++i) { 431 affinity_backup[i] = system_topology::instance().allocate_process_affinity_mask(); 432 #ifdef _WIN32 433 affinity_buffer[i] = system_topology::instance().allocate_process_affinity_mask(); 434 #endif 435 } 436 handler_affinity_mask = system_topology::instance().allocate_process_affinity_mask(); 437 system_topology::instance().fill_constraints_affinity_mask 438 (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core); 439 } 440 441 ~binding_handler() { 442 for (std::size_t i = 0; i < affinity_backup.size(); ++i) { 443 system_topology::instance().free_affinity_mask(affinity_backup[i]); 444 #ifdef _WIN32 445 system_topology::instance().free_affinity_mask(affinity_buffer[i]); 446 #endif 447 } 448 system_topology::instance().free_affinity_mask(handler_affinity_mask); 449 } 450 451 void apply_affinity( unsigned slot_num ) { 452 auto& topology = system_topology::instance(); 453 __TBB_ASSERT(slot_num < affinity_backup.size(), 454 "The slot number is greater than the number of slots in the arena"); 455 __TBB_ASSERT(topology.is_topology_parsed(), 456 "Trying to get access to uninitialized system_topology"); 457 458 topology.store_current_affinity_mask(affinity_backup[slot_num]); 459 460 #ifdef _WIN32 461 // TBBBind supports only systems where NUMA nodes and core types do not cross the border 462 // between several processor groups. So if a certain NUMA node or core type constraint 463 // specified, then the constraints affinity mask will not cross the processor groups' border. 464 465 // But if we have constraint based only on the max_threads_per_core setting, then the 466 // constraints affinity mask does may cross the border between several processor groups 467 // on machines with more then 64 hardware threads. That is why we need to use the special 468 // function, which regulates the number of threads in the current threads mask. 469 if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 && 470 (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) && 471 (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1) 472 ) { 473 topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask); 474 topology.set_affinity_mask(affinity_buffer[slot_num]); 475 return; 476 } 477 #endif 478 topology.set_affinity_mask(handler_affinity_mask); 479 } 480 481 void restore_previous_affinity_mask( unsigned slot_num ) { 482 auto& topology = system_topology::instance(); 483 __TBB_ASSERT(topology.is_topology_parsed(), 484 "Trying to get access to uninitialized system_topology"); 485 topology.set_affinity_mask(affinity_backup[slot_num]); 486 }; 487 488 }; 489 490 extern "C" { // exported to TBB interfaces 491 492 TBBBIND_EXPORT void __TBB_internal_initialize_system_topology( 493 std::size_t groups_num, 494 int& numa_nodes_count, int*& numa_indexes_list, 495 int& core_types_count, int*& core_types_indexes_list 496 ) { 497 system_topology::construct(groups_num); 498 system_topology::instance().fill_topology_information( 499 numa_nodes_count, numa_indexes_list, 500 core_types_count, core_types_indexes_list 501 ); 502 } 503 504 TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) { 505 __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads."); 506 return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core); 507 } 508 509 TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) { 510 __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer."); 511 delete handler_ptr; 512 } 513 514 TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) { 515 __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata."); 516 handler_ptr->apply_affinity(slot_num); 517 } 518 519 TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) { 520 __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata."); 521 handler_ptr->restore_previous_affinity_mask(slot_num); 522 } 523 524 TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) { 525 return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core); 526 } 527 528 void __TBB_internal_destroy_system_topology() { 529 return system_topology::destroy(); 530 } 531 532 } // extern "C" 533 534 } // namespace r1 535 } // namespace detail 536 } // namespace tbb 537 538 #undef assertion_hwloc_wrapper 539