/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( push )
#pragma warning( disable : 4100 )
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( pop )
#endif

#define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)

// Most hwloc calls return a negative exit code on error.
// This macro tracks error codes that are returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the hardware of the machine that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

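    // A successful initialize() call advances the state through the stages in order:
    // uninitialized -> started -> topology_allocated -> topology_loaded -> topology_parsed.
    // If any hwloc call fails, the state stays at the last stage reached; the parsing routines
    // and the destructor check this state to decide how much parsing and cleanup is safe to do.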

    // Binding threads to processors that belong to other Windows processor groups is allowed
    // only if the machine topology contains several Windows processor groups and the process
    // affinity mask was not limited manually (an affinity mask cannot violate processor group
    // boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse the topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset(topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

    void numa_topology_parsing() {
        // Fill the parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to an alternative way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty as well (an invalid case)
            // or if some internal HWLOC error occurred, so -1 is stored as the index in that case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the list of NUMA logical indexes
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity mask lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected an unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

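    // Illustrative result of numa_topology_parsing() on a hypothetical two-node machine with an
    // unrestricted process affinity mask:
    //   numa_nodes_count         == 2
    //   numa_indexes_list        == { 0, 1 }                              // hwloc logical indexes
    //   numa_affinity_masks_list == { cpuset of node 0, cpuset of node 1 }
    // Broken or NUMA-less topologies collapse to a single stub entry instead (see above).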

    void core_types_topology_parsing() {
        // Fill the parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPU support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

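    // Builds the affinity mask that corresponds to the given constraints: the process affinity
    // mask intersected with the selected NUMA node mask and core type mask, optionally trimmed
    // to at most max_threads_per_core bits per physical core. An index of -1 (or a
    // max_threads_per_core of -1) means "no constraint" in that dimension; the result is
    // written into input_mask.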
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index,
                                        int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Trim the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

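    // Illustrative round trip over the mask helpers, as performed by binding_handler below
    // (store_current_affinity_mask() and set_affinity_mask() are declared next):
    //   affinity_mask backup = allocate_process_affinity_mask();
    //   store_current_affinity_mask(backup);     // snapshot on scheduler entry
    //   set_affinity_mask(constraints);          // pin the thread to the constrained mask
    //   set_affinity_mask(backup);               // restore on scheduler exit
    //   free_affinity_mask(backup);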

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "The current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

class binding_handler {
    // The following vector saves the thread affinity mask on scheduler entry so that it can be
    // restored for this thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask(
            handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

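    // apply_affinity() saves the calling thread's current affinity mask into
    // affinity_backup[slot_num] and then pins the thread to handler_affinity_mask (or, on
    // Windows machines with several processor groups, to a per-slot mask derived from it);
    // restore_previous_affinity_mask() sets the saved mask back on scheduler exit.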

    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor group border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why the special function
        // that limits the number of threads per core within the current thread's mask is used.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};

extern "C" { // exported to TBB interfaces

void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create a NUMA handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate a nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper
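
/*
   Illustrative sketch of how the TBB library side might drive the interface exported above.
   The parameter values below are hypothetical and shown for demonstration only:

       int numa_count = 0, core_types_count = 0;
       int *numa_indexes = nullptr, *core_type_indexes = nullptr;
       __TBB_internal_initialize_system_topology(1, numa_count, numa_indexes,
                                                 core_types_count, core_type_indexes);

       // Bind arena slots to the first reported NUMA node, any core type,
       // with no limit on the number of threads per core.
       binding_handler* handler =
           __TBB_internal_allocate_binding_handler(8, numa_indexes[0], core_type_indexes[0], -1);
       __TBB_internal_apply_affinity(handler, 0);    // on scheduler entry for slot 0
       __TBB_internal_restore_affinity(handler, 0);  // on scheduler exit for slot 0
       __TBB_internal_deallocate_binding_handler(handler);
*/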