/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"
#include "oneapi/tbb/detail/_config.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class system_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that reside in other Windows processor groups is allowed
    // only if the machine topology contains several Windows processor groups
    // and the process affinity mask was not limited manually (an affinity mask
    // cannot violate processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse the topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to change the way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (invalid case)
            // or negative if some internal HWLOC error occurred.
            // So we place -1 as the index in this case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected an unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
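        // Hybrid core types parsing falls back to a single stub entry (index -1 covering
        // the whole process mask) when the hwloc hybrid CPU interfaces are unavailable
        // or the parsing fails.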
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    static system_topology* instance_ptr;
public:
    typedef hwloc_cpuset_t affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    static void construct( std::size_t groups_num ) {
        if (instance_ptr == nullptr) {
            instance_ptr = new system_topology();
            instance_ptr->initialize(groups_num);
        }
    }

    static system_topology& instance() {
        __TBB_ASSERT(instance_ptr != nullptr, "Getting instance of non-constructed topology");
        return *instance_ptr;
    }

    static void destroy() {
        __TBB_ASSERT(instance_ptr != nullptr, "Destroying non-constructed topology");
        delete instance_ptr;
    }

    ~system_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Trim the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

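                // Merge the trimmed per-core mask into the resulting affinity mask.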
                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
                     "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

system_topology* system_topology::instance_ptr{nullptr};

class binding_handler {
    // The following vector saves a thread's affinity mask on scheduler entry to return it to the thread
    // on scheduler exit.
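    // The backup is captured in apply_affinity() and re-applied in restore_previous_affinity_mask().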
    typedef std::vector<system_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    system_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = system_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = system_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = system_topology::instance().allocate_process_affinity_mask();
        system_topology::instance().fill_constraints_affinity_mask(
            handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            system_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            system_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        system_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
                     "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized system_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need to use a special
        // function that regulates the number of threads in the current thread's mask.
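        // The check below detects that max_threads_per_core-only case: several processor groups,
        // a thread-per-core limit, and no effective NUMA node or core type restriction.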
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized system_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    }

};

extern "C" { // exported to TBB interfaces

TBBBIND_EXPORT void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    system_topology::construct(groups_num);
    system_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create a NUMA handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate a nullptr pointer.");
    delete handler_ptr;
}

TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_destroy_system_topology() {
    return system_topology::destroy();
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper