/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads to processors located in another Windows processor group is allowed
    // only if the machine topology contains several Windows processor groups and the process
    // affinity mask wasn't limited manually (an affinity mask cannot violate
    // processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to an alternative way of initializing the topology.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be non-positive if the process affinity mask is empty
            // (an invalid case) or if some internal HWLOC error occurred,
            // so -1 is placed as the index in this case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the list of NUMA logical indexes
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity mask lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

    // Fill input_mask with the CPU set that satisfies the given NUMA node, core type,
    // and max_threads_per_core constraints.
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int
                                        core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Trim the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

    // Expand current_mask to whole cores and intersect the result with constraints_mask.
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

    // Save the calling thread's current affinity mask, clipped to the process affinity mask.
    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

class binding_handler {
    // The following vector saves a thread's affinity mask on scheduler entry so that it can be
    // restored to that thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, then the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why the special
        // function below is needed to regulate the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    }

};

extern "C" { // exported to TBB interfaces

void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper
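
// Illustrative usage sketch, kept as a comment so it is not compiled into this translation unit.
// It shows how a hypothetical caller inside the TBB runtime might drive the extern "C" entry
// points above, assuming matching declarations (in tbb::detail::r1) are visible to it.
// The values example_groups_num, example_slot, and the 4-slot arena size are placeholders for
// illustration only; a real caller obtains the processor groups number from its own platform
// detection.
//
//     std::size_t example_groups_num = 1;
//     int  numa_nodes_count = 0, core_types_count = 0;
//     int* numa_indexes = nullptr;
//     int* core_types_indexes = nullptr;
//     __TBB_internal_initialize_system_topology(example_groups_num,
//                                               numa_nodes_count, numa_indexes,
//                                               core_types_count, core_types_indexes);
//
//     // Bind slot 0 of a 4-slot arena to the first reported NUMA node, any core type,
//     // and no limit on threads per core (-1 means "unconstrained" for both parameters).
//     binding_handler* handler = __TBB_internal_allocate_binding_handler(
//         /*number_of_slots*/ 4, numa_indexes[0], /*core_type_id*/ -1, /*max_threads_per_core*/ -1);
//
//     int example_slot = 0;
//     __TBB_internal_apply_affinity(handler, example_slot);    // pin the calling thread
//     __TBB_internal_restore_affinity(handler, example_slot);  // restore its previous mask
//     __TBB_internal_deallocate_binding_handler(handler);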