/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( push )
#pragma warning( disable : 4100 )
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( pop )
#endif

#define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
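
// Illustrative expansion (assuming the usual __TBB_ASSERT_EX behavior): a call such as
//     assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
// becomes
//     __TBB_ASSERT_EX( (hwloc_set_cpubind(topology, mask, HWLOC_CPUBIND_THREAD)) >= 0, ... );
// so the hwloc call is still evaluated in builds where the assertion check itself is disabled.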

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware TBB happens to work on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;
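
    // The initialization state only moves forward:
    //     uninitialized -> started -> topology_allocated -> topology_loaded -> topology_parsed
    // If hwloc initialization or loading fails, the state stops at an intermediate stage and the
    // parsing routines below fall back to stub values (a single entry with index -1).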

    // Binding threads that are located in other Windows Processor groups is allowed only if the
    // machine topology contains several Windows Processor groups and the process affinity mask
    // was not limited manually (an affinity mask cannot violate processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Getting the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }
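
    // Collects the logical indexes of the NUMA nodes available to the process and, for each node,
    // a CPU affinity mask restricted to the process affinity mask.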
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to change the way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (invalid case)
            // or negative if some internal HWLOC error occurred,
            // so we place -1 as the index in the former case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected an unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }
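
    // Enumerates the core types (CPU kinds) reported by hwloc 2.4+ on hybrid CPUs, keeping only
    // the types that intersect the process affinity mask. If the interfaces are unavailable or
    // parsing fails, a single pseudo core type with index -1 covering the whole process mask is used.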
    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parsing the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }
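
    // Releases the per-node and per-core-type masks together with the process-wide masks once the
    // topology has been fully parsed, and destroys the hwloc topology object whenever its
    // allocation succeeded.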
    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }
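
    // Builds the affinity mask that corresponds to the constraints triple
    // (numa_node_index, core_type_index, max_threads_per_core). Conceptually:
    //     input_mask = process mask & NUMA node mask & core type mask
    // and, when max_threads_per_core is positive, each core contributes at most that many
    // hardware threads. An index of -1 means "unconstrained" for the respective parameter.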
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Fit the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If the bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
                     "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};
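
// A sketch of the typical usage through the exported C interface defined at the end of this file
// (illustrative only; the actual callers live on the TBB library side):
//
//     binding_handler* h = __TBB_internal_allocate_binding_handler(slots, numa_id, core_type_id, threads_per_core);
//     __TBB_internal_apply_affinity(h, slot);    // pin the calling thread, backing up its previous mask
//     __TBB_internal_restore_affinity(h, slot);  // restore the backed-up mask on scheduler exit
//     __TBB_internal_deallocate_binding_handler(h);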

class binding_handler {
    // The following vector saves the thread affinity mask on scheduler entry to return it to
    // this thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }
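
    // Binds the calling thread to the handler's constraints mask, first saving the thread's
    // current affinity into affinity_backup[slot_num] so it can be restored later.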
    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
                     "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need to use the special
        // function, which regulates the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
                     "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};

extern "C" { // exported to TBB interfaces

void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create a NUMA handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate a nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper