/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative exit code on error.
// This macro checks error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
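// Usage example (see store_current_affinity_mask() below):
//     assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);
// __TBB_ASSERT_EX evaluates its first argument even when assertions are disabled,
// so the wrapped hwloc call is always performed.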

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads to processors located in other Windows processor groups
    // is allowed only if the machine topology contains several Windows processor groups
    // and the process affinity mask was not limited manually (an affinity mask cannot
    // violate processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
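    // Allocates and loads the hwloc topology, then captures the process CPU and NUMA node
    // affinity masks: either the complete system masks (when intergroup binding is allowed)
    // or the masks derived from the current process binding.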
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Getting process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

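    // Enumerates the NUMA nodes available to the process and builds per-node CPU affinity
    // masks; falls back to a single stub node covering the whole process mask when the
    // topology is unavailable or inconsistent.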
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to change the way the topology is initialized.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty (invalid case)
            // or negative if some internal HWLOC error occurred.
            // So we place -1 as the index in the former case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

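    // Detects hybrid CPU core types via the hwloc cpukinds interfaces (HWLOC >= 2.4) and builds
    // per-core-type affinity masks; falls back to a single stub entry with index -1 that covers
    // the whole process mask when this information is unavailable.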
    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parsing the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

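    // One-shot initialization: loads the hwloc topology and parses the NUMA and core type
    // information; repeated calls have no effect.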
    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

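    // Returns the parsed NUMA and core type index lists; the output pointers refer to the
    // internal storage of this singleton.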
    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

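    // Fills input_mask with the affinity mask matching the given constraints: the process mask
    // intersected with the selected NUMA node and core type masks, optionally limited to at most
    // max_threads_per_core hardware threads on each core.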
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // clear input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // fit the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

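    // Widens current_mask to whole cores (every core it intersects) and clips the result
    // by constraints_mask; used by apply_affinity() on Windows when only the
    // max_threads_per_core constraint is set (see the comment there).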
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

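    // Returns the number of hardware threads that satisfy the given constraints,
    // i.e. the weight of the corresponding constraints affinity mask.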
    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

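    // Saves the calling thread's current affinity mask (clipped by the process affinity mask)
    // into current_mask so that it can be restored later.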
    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

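//------------------------------------------------------------------------
// Applies the affinity mask built from the given constraints (NUMA node, core type,
// max threads per core) to threads occupying arena slots, and restores each thread's
// previous affinity mask when it leaves the slot
//------------------------------------------------------------------------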
class binding_handler {
    // The following vector saves each thread's affinity mask on scheduler entry so that it
    // can be restored for that thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

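    // Saves the current affinity mask of the thread occupying the given arena slot and binds
    // the thread to the handler's constraints affinity mask.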
    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between processor groups. So if a specific NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need to use the special
        // function, which regulates the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};

extern "C" { // exported to TBB interfaces

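// Initializes the platform_topology singleton and returns the parsed NUMA and
// core type information through the output parameters.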
void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper