xref: /oneTBB/src/tbbbind/tbb_bind.cpp (revision b15aabb3)
/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( push )
#pragma warning( disable : 4100 )
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( pop )
#endif

#define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)

// Most hwloc calls return a negative exit code on error.
// This macro tracks error codes that are returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
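
// Illustrative use of the wrapper above (hypothetical call site). The __TBB_ASSERT_EX form keeps
// the wrapped hwloc call evaluated even when assertions are compiled out; only the >= 0 check
// disappears in release builds.
//
//     assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);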

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that reside in other Windows processor groups
    // is allowed only if the machine topology contains several Windows processor groups
    // and the process affinity mask was not limited manually (an affinity mask cannot violate
    // processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Getting process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to switch to an alternative way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (an invalid case),
            // or negative if some internal HWLOC error occurred.
            // So we place -1 as the index in the invalid case and 0 otherwise.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }
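
    // Illustrative outcome (hypothetical machine): on a two-node system whose process mask
    // covers both nodes, numa_indexes_list becomes {0, 1} and numa_affinity_masks_list[i]
    // holds node i's cpuset clipped to the process affinity mask.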

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }
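
    // Illustrative outcome (hypothetical hybrid CPU): with two hwloc cpukinds visible to the
    // process, core_types_indexes_list becomes {0, 1}; if parsing fails or hwloc is older
    // than 2.4, the fallback is a single {-1} entry whose mask is the whole process mask.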

public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // clear input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // fit the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

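    // Expands current_mask to the whole cores it touches, then clips the result to
    // constraints_mask. Used from apply_affinity() below on Windows so that a
    // max_threads_per_core constraint is honored without leaving the thread's current
    // processor group.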
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }
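
    // Worked example (hypothetical machine): with two NUMA nodes of four two-way SMT cores
    // each, get_default_concurrency(0, -1, 1) intersects the process mask with NUMA node 0
    // and keeps one hardware thread per core, returning 4, while
    // get_default_concurrency(-1, -1, -1) applies no constraints and returns 16.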

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

class binding_handler {
    // The following vector saves the thread affinity mask on scheduler entry so that it can be
    // restored for this thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, then the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need the special
        // function that limits the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};
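
// Typical lifecycle of a binding_handler, as implied by the exported C interface below
// (illustrative sketch only; the real caller lives in the oneTBB runtime):
//
//     binding_handler* handler = __TBB_internal_allocate_binding_handler(
//         /*number_of_slots*/ 4, /*numa_id*/ -1, /*core_type_id*/ -1, /*max_threads_per_core*/ 1);
//     __TBB_internal_apply_affinity(handler, /*slot_num*/ 0);   // pin the thread, back up its old mask
//     /* ... execute arena work on this thread ... */
//     __TBB_internal_restore_affinity(handler, /*slot_num*/ 0); // restore the saved mask
//     __TBB_internal_deallocate_binding_handler(handler);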

extern "C" { // exported to TBB interfaces

void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}
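
// Illustrative call from the library side (hypothetical snippet; the real caller lives in the
// oneTBB runtime). The out-parameters receive pointers into the topology singleton's storage,
// so they remain valid for as long as the singleton is alive:
//
//     int numa_count = 0, core_type_count = 0;
//     int *numa_indexes = nullptr, *core_type_indexes = nullptr;
//     __TBB_internal_initialize_system_topology(/*groups_num*/ 1,
//         numa_count, numa_indexes, core_type_count, core_type_indexes);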

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper