// /oneTBB/src/tbbbind/tbb_bind.cpp (revision d86ed7fb)
/*
    Copyright (c) 2019-2020 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( push )
#pragma warning( disable : 4100 )
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER
#pragma warning( pop )
#endif

#include <vector>

// Most hwloc calls return a negative value on error.
// This macro checks the error codes that are returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
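// Usage example (see the calls below): assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
// __TBB_ASSERT_EX evaluates its expression even when assertions are disabled, so the wrapped
// hwloc call is always performed.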

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the hardware of the machine that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class numa_affinity_handler;

    // TODO: add the `my_` prefix to the members
    hwloc_topology_t topology;
    hwloc_cpuset_t   process_cpu_affinity_mask;
    hwloc_nodeset_t  process_node_affinity_mask;
    std::vector<hwloc_cpuset_t>  affinity_masks_list;

    std::vector<int> default_concurrency_list;
    std::vector<int> numa_indexes_list;
    int  numa_nodes_count;

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;
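    // initialize() advances initialization_state through these stages in order; the destructor
    // uses the stage that was reached to decide which resources (affinity bitmaps, the hwloc
    // topology object) still need to be released.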

    // Binding threads to NUMA nodes located in other Windows Processor groups is allowed
    // only if the machine topology contains several Windows Processor groups and the process
    // affinity mask was not limited manually (an affinity mask cannot violate processor
    // group boundaries).
    bool intergroup_binding_allowed(size_t groups_num) { return groups_num > 1; }

    platform_topology() : topology(nullptr),
                          process_cpu_affinity_mask(nullptr),
                          process_node_affinity_mask(nullptr),
                          numa_nodes_count(0),
                          initialization_state(uninitialized) {}

public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

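    // Meyers singleton: the function-local static below is constructed on first use,
    // so a single topology instance is shared by all callers.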
    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }

        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            if ( initialization_state == topology_allocated ) {
                hwloc_topology_destroy(topology);
            }
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            default_concurrency_list.push_back(-1);
            return;
        }

        // Getting process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

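            // hwloc_get_cpubind with no flags retrieves the current process binding;
            // hwloc_cpuset_to_nodeset then derives the set of NUMA nodes covered by that cpuset.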
            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        // Get the number of available NUMA nodes
        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to switch to the fallback way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (an invalid case)
            // or negative if some internal HWLOC error occurred,
            // so we place -1 as the index when the count is zero.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;
            default_concurrency_list.push_back(hwloc_bitmap_weight(process_cpu_affinity_mask));

            affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
            initialization_state = topology_parsed;
            return;
        }

        // Get NUMA logical indexes list
        unsigned counter = 0;
        int i = 0;
        int max_numa_index = -1;
        numa_indexes_list.resize(numa_nodes_count);
        hwloc_obj_t node_buffer;
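        // Iterate over the OS indexes present in the process node mask and record hwloc's
        // logical index for each NUMA node, tracking the maximal logical index seen.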
        hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
            node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
            numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

            if ( numa_indexes_list[counter] > max_numa_index ) {
                max_numa_index = numa_indexes_list[counter];
            }

            counter++;
        } hwloc_bitmap_foreach_end();
        __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

        // Fill concurrency and affinity masks lists
        default_concurrency_list.resize(max_numa_index + 1);
        affinity_masks_list.resize(max_numa_index + 1);

        int index = 0;
        hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
            node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
            index = static_cast<int>(node_buffer->logical_index);

            hwloc_cpuset_t& current_mask = affinity_masks_list[index];
            current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

            hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
            __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            default_concurrency_list[index] = hwloc_bitmap_weight(current_mask);
        } hwloc_bitmap_foreach_end();
        initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (int i = 0; i < numa_nodes_count; i++) {
                hwloc_bitmap_free(affinity_masks_list[numa_indexes_list[i]]);
            }
            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill(int& nodes_count, int*& indexes_list, int*& concurrency_list ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        nodes_count = numa_nodes_count;
        indexes_list = &numa_indexes_list.front();
        concurrency_list = &default_concurrency_list.front();
    }

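    // Returns a private copy of the process affinity mask; the caller owns the copy
    // and must release it with free_affinity_mask().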
    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_new_affinity_mask( const_affinity_mask new_mask ) {
        assertion_hwloc_wrapper(hwloc_set_cpubind, topology, new_mask, HWLOC_CPUBIND_THREAD);
    }

    const_affinity_mask get_node_affinity_mask( int node_index ) {
        __TBB_ASSERT((int)affinity_masks_list.size() > node_index,
            "Trying to get affinity mask for uninitialized NUMA node");
        return affinity_masks_list[node_index];
    }
};

class binding_handler {
    // The following vector saves a thread's affinity mask on scheduler entry so that it can be
    // restored for this thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;

public:
    binding_handler( size_t size ) : affinity_backup(size) {
        for (affinity_masks_container::iterator it = affinity_backup.begin();
             it != affinity_backup.end(); it++) {
            *it = platform_topology::instance().allocate_process_affinity_mask();
        }
    }

    ~binding_handler() {
        for (affinity_masks_container::iterator it = affinity_backup.begin();
             it != affinity_backup.end(); it++) {
            platform_topology::instance().free_affinity_mask(*it);
        }
    }

    void bind_thread_to_node( unsigned slot_num, unsigned numa_node_id ) {
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(platform_topology::instance().is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
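        // Save the calling thread's current affinity into this slot's backup mask,
        // then bind the thread to the cpuset of the requested NUMA node.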
        platform_topology::instance().store_current_affinity_mask(affinity_backup[slot_num]);

        platform_topology::instance().set_new_affinity_mask(
            platform_topology::instance().get_node_affinity_mask(numa_node_id));
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        __TBB_ASSERT(platform_topology::instance().is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        platform_topology::instance().set_new_affinity_mask(affinity_backup[slot_num]);
    }

};

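// The functions below are the interface exported to the TBB library. A typical call sequence
// (a sketch based on the entry points in this file, not a prescribed contract):
//   1. __TBB_internal_initialize_numa_topology() parses the machine topology once;
//   2. __TBB_internal_allocate_binding_handler() creates per-slot backup masks for an arena;
//   3. __TBB_internal_bind_to_node() / __TBB_internal_restore_affinity() are called on
//      scheduler entry/exit for each slot;
//   4. __TBB_internal_deallocate_binding_handler() releases the backup masks.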
extern "C" { // exported to TBB interfaces

void __TBB_internal_initialize_numa_topology( size_t groups_num, int& nodes_count, int*& indexes_list, int*& concurrency_list ) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill(nodes_count, indexes_list, concurrency_list);
}

binding_handler* __TBB_internal_allocate_binding_handler(int slot_num) {
    __TBB_ASSERT(slot_num > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(slot_num);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_bind_to_node(binding_handler* handler_ptr, int slot_num, int numa_id) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    __TBB_ASSERT(platform_topology::instance().is_topology_parsed(),
        "Trying to get access to uninitialized platform_topology.");
    handler_ptr->bind_thread_to_node(slot_num, numa_id);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    __TBB_ASSERT(platform_topology::instance().is_topology_parsed(),
        "Trying to get access to uninitialized platform_topology.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper