/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"
#include "oneapi/tbb/detail/_config.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative value on error.
// This macro checks the error codes that are returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class system_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

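    // Topology initialization walks through the stages below in order:
    // uninitialized -> started -> topology_allocated -> topology_loaded -> topology_parsed.
    // Reaching topology_parsed means that the NUMA and core type lists above are filled and the
    // public interfaces may be used; any earlier stage makes the parsing routines fall back to
    // stub values, and the destructor uses the reached stage to decide which hwloc objects to free.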
    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads to processors located in other Windows processor groups
    // is allowed only if the machine topology contains several Windows processor groups
    // and the process affinity mask was not limited manually (an affinity mask cannot cross
    // processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

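    // Fills numa_nodes_count, numa_indexes_list and numa_affinity_masks_list from the loaded
    // topology. Each NUMA node's affinity mask is intersected with the process affinity mask,
    // so only the NUMA nodes available to the process are exposed. If the topology failed to
    // load, a single stub node with index -1 is reported instead.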
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to a different way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (invalid case)
            // or if some internal HWLOC error occurred, so -1 is stored as the index in this case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

    void enforce_hwloc_2_5_runtime_linkage() {
        // Without the call to this function, HWLOC 2.4 could be successfully loaded while tbbbind_2_5 is loaded.
        // That is possible because tbbbind_2_5 does not use any of the new entry points introduced in HWLOC 2.5.
        // However, tbbbind_2_5 is compiled against the HWLOC 2.5 headers, so such a situation would require binary
        // forward compatibility, which the HWLOC library does not guarantee. To enforce linking tbbbind_2_5 only
        // with HWLOC >= 2.5, this function calls an interface that is available only starting from HWLOC 2.5.
#if HWLOC_API_VERSION >= 0x20500
        auto some_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, nullptr);
        hwloc_get_obj_with_same_locality(topology, some_core, HWLOC_OBJ_CORE, nullptr, nullptr, 0);
#endif
    }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        enforce_hwloc_2_5_runtime_linkage();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    static system_topology* instance_ptr;
public:
    typedef hwloc_cpuset_t affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    static void construct( std::size_t groups_num ) {
        if (instance_ptr == nullptr) {
            instance_ptr = new system_topology();
            instance_ptr->initialize(groups_num);
        }
    }

    static system_topology& instance() {
        __TBB_ASSERT(instance_ptr != nullptr, "Getting instance of non-constructed topology");
        return *instance_ptr;
    }

    static void destroy() {
        __TBB_ASSERT(instance_ptr != nullptr, "Destroying non-constructed topology");
        delete instance_ptr;
    }

    ~system_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

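    // Fills input_mask with the CPUs that satisfy the given constraints: the process affinity mask
    // intersected with the chosen NUMA node and core type masks (an index of -1 means "no constraint"),
    // optionally trimmed so that at most max_threads_per_core bits per core remain. A rough usage
    // sketch (the concrete index values are illustrative and assume the topology is already parsed):
    //
    //     system_topology& st = system_topology::instance();
    //     system_topology::affinity_mask mask = st.allocate_process_affinity_mask();
    //     st.fill_constraints_affinity_mask(mask, /*numa_node_index*/ 0, /*core_type_index*/ -1,
    //                                       /*max_threads_per_core*/ 1);
    //     // ... bind the current thread with st.set_affinity_mask(mask) ...
    //     st.free_affinity_mask(mask);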
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Fit the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
                     "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

system_topology* system_topology::instance_ptr{nullptr};

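// Stores the affinity mask built from the constraints of one arena and applies it to arena threads
// on scheduler entry, restoring each thread's previous affinity on scheduler exit.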
class binding_handler {
    // The following vector saves the thread affinity mask on scheduler entry so that it can be
    // restored for this thread on scheduler exit.
    typedef std::vector<system_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    system_topology::affinity_mask handler_affinity_mask;

#ifdef _WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#ifdef _WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = system_topology::instance().allocate_process_affinity_mask();
#ifdef _WIN32
            affinity_buffer[i] = system_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = system_topology::instance().allocate_process_affinity_mask();
        system_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            system_topology::instance().free_affinity_mask(affinity_backup[i]);
#ifdef _WIN32
            system_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        system_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
                     "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized system_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#ifdef _WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask does not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, the constraints
        // affinity mask may cross the border between several processor groups on machines with more
        // than 64 hardware threads. That is why the special function, which limits the number of
        // threads in the current thread's mask, has to be used.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized system_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    }
};

extern "C" { // exported to TBB interfaces

TBBBIND_EXPORT void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    system_topology::construct(groups_num);
    system_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_destroy_system_topology() {
    return system_topology::destroy();
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper