/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
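// For illustration only (this comment adds no build logic): a guarded call such as
//     assertion_hwloc_wrapper(hwloc_get_cpubind, topology, mask, 0);
// expands to
//     __TBB_ASSERT_EX( (hwloc_get_cpubind(topology, mask, 0)) >= 0, "Error occurred during call to hwloc API.");
// so the hwloc call itself is still evaluated in builds where the assertion check is disabled.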
namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to work on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that reside in other Windows processor groups is allowed
    // only if the machine topology contains several Windows processor groups
    // and the process affinity mask was not limited manually (an affinity mask
    // cannot violate processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }
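    // Builds numa_indexes_list (hwloc logical indexes of the NUMA nodes available to the process)
    // and numa_affinity_masks_list (one cpuset per logical index, restricted to the process
    // affinity mask). Falls back to a single stub entry when the topology is not loaded or the
    // node set cannot be interpreted.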
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to another way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (invalid case)
            // or if some internal HWLOC error occurred.
            // So -1 is placed as the index in this case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }
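    // The public interface below is used by binding_handler and by the extern "C" entry points
    // at the end of this file; initialize() is expected to run once before any of the mask-related
    // queries, which assert that the topology has been parsed.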
public:
    typedef hwloc_cpuset_t affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Fit the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }
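    // Expands current_mask to whole cores: the result is the union of the cpusets of all cores
    // that intersect current_mask, clipped by constraints_mask. Its only caller is the
    // Windows-specific branch of binding_handler::apply_affinity below.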
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};
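// Stores the constraints affinity mask computed for a set of arena slots and, for every slot,
// backs up the thread's current affinity mask before binding so that it can be restored on
// scheduler exit.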
class binding_handler {
    // The following vector saves the thread affinity mask on scheduler entry so that it can be
    // restored for this thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, then the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why the special function
        // which regulates the number of threads in the current thread's mask is needed.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};
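// A rough sketch of the expected call order for the entry points below (the actual caller lives
// in the oneTBB runtime, not in this file; argument names here are illustrative only):
//     __TBB_internal_initialize_system_topology(groups_num, nodes_count, node_ids, types_count, type_ids);
//     binding_handler* h = __TBB_internal_allocate_binding_handler(slots, numa_id, core_type_id, threads_per_core);
//     __TBB_internal_apply_affinity(h, slot);    // on scheduler entry
//     __TBB_internal_restore_affinity(h, slot);  // on scheduler exit
//     __TBB_internal_deallocate_binding_handler(h);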
extern "C" { // exported to TBB interfaces

void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper