/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"
#include "oneapi/tbb/detail/_config.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
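// For illustration, a call such as
//     assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
// expands to
//     __TBB_ASSERT_EX( (hwloc_set_cpubind(topology, mask, HWLOC_CPUBIND_THREAD)) >= 0,
//                      "Error occurred during call to hwloc API.");
// The __TBB_ASSERT_EX form is used so that the hwloc call is still evaluated when
// TBB assertions are disabled; only the >= 0 check is dropped in that case.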

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that are located in other Windows Processor Groups is allowed only
    // if the machine topology contains several Windows Processor Groups and the process
    // affinity mask was not limited manually (an affinity mask cannot cross processor
    // group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }
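    // Builds numa_indexes_list and numa_affinity_masks_list from the loaded topology,
    // one entry per NUMA node visible through the process node affinity mask. If the
    // topology is not loaded, or hwloc reports no usable NUMA nodes, the lists are
    // filled with a single stub entry instead.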
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to the fallback way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty as well
            // (an invalid case), or negative if some internal HWLOC error occurred;
            // -1 is stored as the index in the former case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected an unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }
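    // Builds core_types_indexes_list and core_types_affinity_masks_list using the hwloc
    // cpukinds interfaces (available since HWLOC 2.4) on hybrid CPUs. If these interfaces
    // are unavailable or report an error, a single stub entry with index -1 covering the
    // whole process CPU affinity mask is used instead.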
    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }
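    // Computes into input_mask the affinity mask that satisfies the given constraints:
    // the process CPU mask intersected with the selected NUMA node and core type masks
    // (an index of -1 means "no restriction"). If max_threads_per_core is positive, the
    // mask is additionally trimmed so that at most that many hardware threads remain in
    // each core; e.g. max_threads_per_core == 1 keeps a single logical CPU per core.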
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Fit the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
                     "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};
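// Keeps, for each arena slot, a backup of the thread's previous affinity mask and pins
// the thread occupying the slot to the affinity mask built from the binding constraints
// (NUMA node, core type, max_threads_per_core); the backup is used to restore the mask
// when the thread leaves the slot.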
class binding_handler {
    // The following vector saves a thread's affinity mask on scheduler entry so that it
    // can be restored for that thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
                     "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the
        // border between processor groups. So if a certain NUMA node or core type constraint
        // is specified, the constraints affinity mask does not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need to use the
        // special function, which regulates the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    }
};
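/*
    Entry points exported to the TBB runtime, which is expected to load the tbbbind
    library dynamically and resolve these symbols by name. A minimal usage sketch is
    shown below; the variable names (groups_num, number_of_slots, numa_id, core_type_id,
    max_threads_per_core, slot_num) are illustrative placeholders only:

        int numa_nodes_count = 0, core_types_count = 0;
        int *numa_indexes = nullptr, *core_type_indexes = nullptr;
        __TBB_internal_initialize_system_topology(groups_num,
            numa_nodes_count, numa_indexes, core_types_count, core_type_indexes);

        binding_handler* handler = __TBB_internal_allocate_binding_handler(
            number_of_slots, numa_id, core_type_id, max_threads_per_core);

        __TBB_internal_apply_affinity(handler, slot_num);   // on arena slot entry
        __TBB_internal_restore_affinity(handler, slot_num); // on arena slot exit

        __TBB_internal_deallocate_binding_handler(handler);
*/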
extern "C" { // exported to TBB interfaces

TBBBIND_EXPORT void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create a binding handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate a nullptr pointer.");
    delete handler_ptr;
}

TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper