xref: /oneTBB/src/tbbbind/tbb_bind.cpp (revision b15aabb3)
1 /*
2     Copyright (c) 2019-2021 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 #include <vector>
18 #include <mutex>
19 
20 #include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
21 #include "oneapi/tbb/detail/_assert.h"
22 
23 #if _MSC_VER && !__INTEL_COMPILER
24 #pragma warning( push )
25 #pragma warning( disable : 4100 )
26 #endif
27 #include <hwloc.h>
28 #if _MSC_VER && !__INTEL_COMPILER
29 #pragma warning( pop )
30 #endif
31 
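// Note: hwloc 2.4 (API version 0x20400) introduced the hwloc_cpukinds_* interfaces used below
// to detect hybrid CPUs; on older hwloc versions that code path is compiled out.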
32 #define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
33 
34 // Most hwloc calls return a negative exit code on error.
35 // This macro checks the error codes returned from the hwloc interfaces.
36 #define assertion_hwloc_wrapper(command, ...) \
37         __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
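// Usage sketch (illustrative; it mirrors the calls made later in this file): the wrapped command
// is invoked with the given arguments and its return code is asserted to be non-negative, e.g.
//     assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
// __TBB_ASSERT_EX is used so that the call is still evaluated when assertions are disabled.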
38 
39 namespace tbb {
40 namespace detail {
41 namespace r1 {
42 
43 //------------------------------------------------------------------------
44 // Information about the hardware of the machine that TBB happens to run on
45 //------------------------------------------------------------------------
46 class platform_topology {
47     friend class binding_handler;
48 
49     // Common topology members
50     hwloc_topology_t topology{nullptr};
51     hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
52     hwloc_nodeset_t  process_node_affinity_mask{nullptr};
53     std::size_t number_of_processors_groups{1};
54 
55     // NUMA API related topology members
56     std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
57     std::vector<int> numa_indexes_list{};
58     int numa_nodes_count{0};
59 
60     // Hybrid CPUs API related topology members
61     std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
62     std::vector<int> core_types_indexes_list{};
63 
64     enum init_stages { uninitialized,
65                        started,
66                        topology_allocated,
67                        topology_loaded,
68                        topology_parsed } initialization_state;
69 
70     // Binding threads located in other Windows processor groups
71     // is allowed only if the machine topology contains several Windows processor groups
72     // and the process affinity mask was not limited manually (an affinity mask cannot
73     // violate processor group boundaries).
74     bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }
75 
76 private:
77     void topology_initialization(std::size_t groups_num) {
78         initialization_state = started;
79 
80         // Parse topology
81         if ( hwloc_topology_init( &topology ) == 0 ) {
82             initialization_state = topology_allocated;
83             if ( hwloc_topology_load( topology ) == 0 ) {
84                 initialization_state = topology_loaded;
85             }
86         }
87         if ( initialization_state != topology_loaded )
88             return;
89 
90         // Getting process affinity mask
91         if ( intergroup_binding_allowed(groups_num) ) {
92             process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
93             process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
94         } else {
95             process_cpu_affinity_mask  = hwloc_bitmap_alloc();
96             process_node_affinity_mask = hwloc_bitmap_alloc();
97 
98             assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
99             hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
100         }
101 
102         number_of_processors_groups = groups_num;
103     }
104 
105     void numa_topology_parsing() {
106         // Fill parameters with stubs if topology parsing is broken.
107         if ( initialization_state != topology_loaded ) {
108             numa_nodes_count = 1;
109             numa_indexes_list.push_back(-1);
110             return;
111         }
112 
113         // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
114         // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
115         // to switch to an alternative way of topology initialization.
116         numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
117         if (numa_nodes_count <= 0) {
118             // numa_nodes_count may be zero if the process affinity mask is empty (an invalid case)
119             // or negative if some internal HWLOC error occurred.
120             // In the invalid zero case we store -1 as the index, otherwise 0.
121             numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
122             numa_nodes_count = 1;
123 
124             numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
125         } else {
126             // Get NUMA logical indexes list
127             unsigned counter = 0;
128             int i = 0;
129             int max_numa_index = -1;
130             numa_indexes_list.resize(numa_nodes_count);
131             hwloc_obj_t node_buffer;
132             hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
133                 node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
134                 numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);
135 
136                 if ( numa_indexes_list[counter] > max_numa_index ) {
137                     max_numa_index = numa_indexes_list[counter];
138                 }
139 
140                 counter++;
141             } hwloc_bitmap_foreach_end();
142             __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");
143 
144             // Fill concurrency and affinity masks lists
145             numa_affinity_masks_list.resize(max_numa_index + 1);
146             int index = 0;
147             hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
148                 node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
149                 index = static_cast<int>(node_buffer->logical_index);
150 
151                 hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
152                 current_mask = hwloc_bitmap_dup(node_buffer->cpuset);
153 
154                 hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
155                 __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
156             } hwloc_bitmap_foreach_end();
157         }
158     }
159 
160     void core_types_topology_parsing() {
161         // Fill parameters with stubs if topology parsing is broken.
162         if ( initialization_state != topology_loaded ) {
163             core_types_indexes_list.push_back(-1);
164             return;
165         }
166 #if __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
167         __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
168         // Parsing the hybrid CPU topology
169         int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
170         bool core_types_parsing_broken = core_types_number <= 0;
171         if (!core_types_parsing_broken) {
172             core_types_affinity_masks_list.resize(core_types_number);
173             int efficiency{-1};
174 
175             for (int core_type = 0; core_type < core_types_number; ++core_type) {
176                 hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
177                 current_mask = hwloc_bitmap_alloc();
178 
179                 if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
180                     && efficiency >= 0
181                 ) {
182                     hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
183 
184                     if (hwloc_bitmap_weight(current_mask) > 0) {
185                         core_types_indexes_list.push_back(core_type);
186                     }
187                     __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
188                 } else {
189                     core_types_parsing_broken = true;
190                     break;
191                 }
192             }
193         }
194 #else /*!__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
195         bool core_types_parsing_broken{true};
196 #endif /*__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
197 
198         if (core_types_parsing_broken) {
199             for (auto& core_type_mask : core_types_affinity_masks_list) {
200                 hwloc_bitmap_free(core_type_mask);
201             }
202             core_types_affinity_masks_list.resize(1);
203             core_types_indexes_list.resize(1);
204 
205             core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
206             core_types_indexes_list[0] = -1;
207         }
208     }
209 
210 public:
211     typedef hwloc_cpuset_t             affinity_mask;
212     typedef hwloc_const_cpuset_t const_affinity_mask;
213 
214     static platform_topology& instance() {
215         static platform_topology topology;
216         return topology;
217     }
218 
219     bool is_topology_parsed() { return initialization_state == topology_parsed; }
220 
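    // Parses the machine topology exactly once; repeated calls are no-ops.
    // groups_num is the number of Windows processor groups reported by the caller.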
221     void initialize( std::size_t groups_num ) {
222         if ( initialization_state != uninitialized )
223             return;
224 
225         topology_initialization(groups_num);
226         numa_topology_parsing();
227         core_types_topology_parsing();
228 
229         if (initialization_state == topology_loaded)
230             initialization_state = topology_parsed;
231     }
232 
233     ~platform_topology() {
234         if ( is_topology_parsed() ) {
235             for (auto& numa_node_mask : numa_affinity_masks_list) {
236                 hwloc_bitmap_free(numa_node_mask);
237             }
238 
239             for (auto& core_type_mask : core_types_affinity_masks_list) {
240                 hwloc_bitmap_free(core_type_mask);
241             }
242 
243             hwloc_bitmap_free(process_node_affinity_mask);
244             hwloc_bitmap_free(process_cpu_affinity_mask);
245         }
246 
247         if ( initialization_state >= topology_allocated ) {
248             hwloc_topology_destroy(topology);
249         }
250 
251         initialization_state = uninitialized;
252     }
253 
254     void fill_topology_information(
255         int& _numa_nodes_count, int*& _numa_indexes_list,
256         int& _core_types_count, int*& _core_types_indexes_list
257     ) {
258         __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
259         _numa_nodes_count = numa_nodes_count;
260         _numa_indexes_list = numa_indexes_list.data();
261 
262         _core_types_count = (int)core_types_indexes_list.size();
263         _core_types_indexes_list = core_types_indexes_list.data();
264     }
265 
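    // Fills input_mask with the affinity mask that satisfies the given constraints: the process
    // CPU affinity mask intersected with the selected NUMA node and core type masks (an index of
    // -1 means "no constraint"), and trimmed to max_threads_per_core threads on each core when
    // max_threads_per_core is positive.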
266     void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
267         __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
268         __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
269         __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
270         __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");
271 
272         hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
273         hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();
274 
275         hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
276         if (numa_node_index >= 0) {
277             hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
278         }
279         if (core_type_index >= 0) {
280             hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
281         }
282         if (max_threads_per_core > 0) {
283             // clear input mask
284             hwloc_bitmap_zero(input_mask);
285 
286             hwloc_obj_t current_core = nullptr;
287             while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
288                 hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);
289 
290                 // trim the core mask to at most max_threads_per_core bits
291                 int current_threads_per_core = 0;
292                 for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
293                     if (++current_threads_per_core > max_threads_per_core) {
294                         hwloc_bitmap_clr(core_mask, id);
295                     }
296                 }
297 
298                 hwloc_bitmap_or(input_mask, input_mask, core_mask);
299             }
300         } else {
301             hwloc_bitmap_copy(input_mask, constraints_mask);
302         }
303 
304         hwloc_bitmap_free(core_mask);
305         hwloc_bitmap_free(constraints_mask);
306     }
307 
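    // Builds result_mask from whole cores: every core whose cpuset intersects current_mask
    // contributes all of its hardware threads, and the result is then clipped by constraints_mask.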
308     void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
309         hwloc_bitmap_zero(result_mask);
310         hwloc_obj_t current_core = nullptr;
311         while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
312             if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
313                 hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
314             }
315         }
316         hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
317     }
318 
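    // Returns the number of hardware threads that satisfy the given constraints, i.e. the weight
    // of the corresponding constraints affinity mask.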
319     int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
320         __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
321 
322         hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
323         fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);
324 
325         int default_concurrency = hwloc_bitmap_weight(constraints_mask);
326         hwloc_bitmap_free(constraints_mask);
327         return default_concurrency;
328     }
329 
330     affinity_mask allocate_process_affinity_mask() {
331         __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
332         return hwloc_bitmap_dup(process_cpu_affinity_mask);
333     }
334 
335     void free_affinity_mask( affinity_mask mask_to_free ) {
336         hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
337     }
338 
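    // Captures the calling thread's current CPU binding into current_mask and clips it to the
    // process affinity mask so that the previous binding can be restored later.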
339     void store_current_affinity_mask( affinity_mask current_mask ) {
340         assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);
341 
342         hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
343         __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
344             "Current affinity mask must intersects with process affinity mask");
345     }
346 
347     void set_affinity_mask( const_affinity_mask mask ) {
348         if (hwloc_bitmap_weight(mask) > 0) {
349             assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
350         }
351     }
352 };
353 
354 class binding_handler {
355     // The following vector saves the thread affinity mask on scheduler entry so that it can be
356     // restored for this thread on scheduler exit.
357     typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
358     affinity_masks_container affinity_backup;
359     platform_topology::affinity_mask handler_affinity_mask;
360 
361 #if WIN32
362     affinity_masks_container affinity_buffer;
363     int my_numa_node_id;
364     int my_core_type_id;
365     int my_max_threads_per_core;
366 #endif
367 
368 public:
369     binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
370         : affinity_backup(size)
371 #if WIN32
372         , affinity_buffer(size)
373         , my_numa_node_id(numa_node_id)
374         , my_core_type_id(core_type_id)
375         , my_max_threads_per_core(max_threads_per_core)
376 #endif
377     {
378         for (std::size_t i = 0; i < size; ++i) {
379             affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
380 #if WIN32
381             affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
382 #endif
383         }
384         handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
385         platform_topology::instance().fill_constraints_affinity_mask
386             (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
387     }
388 
389     ~binding_handler() {
390         for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
391             platform_topology::instance().free_affinity_mask(affinity_backup[i]);
392 #if WIN32
393             platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
394 #endif
395         }
396         platform_topology::instance().free_affinity_mask(handler_affinity_mask);
397     }
398 
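    // Saves the current affinity of the thread occupying arena slot slot_num into
    // affinity_backup[slot_num] and then binds the thread to the handler's constraints mask.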
399     void apply_affinity( unsigned slot_num ) {
400         auto& topology = platform_topology::instance();
401         __TBB_ASSERT(slot_num < affinity_backup.size(),
402             "The slot number is greater than the number of slots in the arena");
403         __TBB_ASSERT(topology.is_topology_parsed(),
404             "Trying to get access to uninitialized platform_topology");
405 
406         topology.store_current_affinity_mask(affinity_backup[slot_num]);
407 
408 #if WIN32
409         // TBBBind supports only systems where NUMA nodes and core types do not cross the border
410         // between several processor groups. So if a certain NUMA node or core type constraint is
411         // specified, then the constraints affinity mask will not cross the processor group border.
412 
413         // But if the constraint is based only on the max_threads_per_core setting, then the
414         // constraints affinity mask may cross the border between several processor groups
415         // on machines with more than 64 hardware threads. That is why we need to use a special
416         // function, which regulates the number of threads in the current thread's mask.
417         if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
418             (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
419             (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
420         ) {
421             topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
422             topology.set_affinity_mask(affinity_buffer[slot_num]);
423             return;
424         }
425 #endif
426         topology.set_affinity_mask(handler_affinity_mask);
427     }
428 
429     void restore_previous_affinity_mask( unsigned slot_num ) {
430         auto& topology = platform_topology::instance();
431         __TBB_ASSERT(topology.is_topology_parsed(),
432             "Trying to get access to uninitialized platform_topology");
433         topology.set_affinity_mask(affinity_backup[slot_num]);
434     };
435 
436 };
437 
438 extern "C" { // exported to TBB interfaces
439 
440 void __TBB_internal_initialize_system_topology(
441     std::size_t groups_num,
442     int& numa_nodes_count, int*& numa_indexes_list,
443     int& core_types_count, int*& core_types_indexes_list
444 ) {
445     platform_topology::instance().initialize(groups_num);
446     platform_topology::instance().fill_topology_information(
447         numa_nodes_count, numa_indexes_list,
448         core_types_count, core_types_indexes_list
449     );
450 }
451 
452 binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
453     __TBB_ASSERT(number_of_slots > 0, "Trying to create a NUMA handler for 0 threads.");
454     return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
455 }
456 
457 void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
458     __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
459     delete handler_ptr;
460 }
461 
462 void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
463     __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
464     handler_ptr->apply_affinity(slot_num);
465 }
466 
467 void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
468     __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
469     handler_ptr->restore_previous_affinity_mask(slot_num);
470 }
471 
472 int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
473     return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
474 }
475 
476 } // extern "C"
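// Illustrative call sequence (a sketch only; the real callers of these entry points live in the
// oneTBB runtime, which typically loads this tbbbind library at run time, and the exact driving
// code is not shown here):
//
//     int numa_count = 0, core_type_count = 0;
//     int *numa_indexes = nullptr, *core_type_indexes = nullptr;
//     __TBB_internal_initialize_system_topology(/*groups_num*/ 1,
//         numa_count, numa_indexes, core_type_count, core_type_indexes);
//     binding_handler* handler = __TBB_internal_allocate_binding_handler(
//         /*number_of_slots*/ 4, /*numa_id*/ -1, /*core_type_id*/ -1, /*max_threads_per_core*/ -1);
//     __TBB_internal_apply_affinity(handler, /*slot_num*/ 0);
//     // ... work executed with the constrained affinity ...
//     __TBB_internal_restore_affinity(handler, /*slot_num*/ 0);
//     __TBB_internal_deallocate_binding_handler(handler);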
477 
478 } // namespace r1
479 } // namespace detail
480 } // namespace tbb
481 
482 #undef assertion_hwloc_wrapper
483