/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
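// Example of intended usage (see set_affinity_mask below):
//     assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);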

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

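    // Stages of topology initialization; each stage implies that all previous stages have
    // completed successfully. The destructor relies on this state to release only the
    // resources that were actually acquired.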
    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that live in other Windows processor groups is allowed only if the
    // machine topology contains several Windows processor groups and the process affinity
    // mask was not limited manually (an affinity mask cannot cross processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
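    // Allocates and loads the hwloc topology object, then captures the process CPU and node
    // affinity masks: either the complete machine masks or the current process binding,
    // depending on whether intergroup binding is allowed.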
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Getting process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

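    // Builds the list of NUMA node logical indexes and the per-node CPU affinity masks.
    // Falls back to a single stub entry when the topology could not be loaded or when
    // hwloc reports no usable NUMA nodes.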
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to change the way the topology is initialized.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty (an invalid case)
            // or negative if some internal HWLOC error occurred; in the former case we place -1
            // as the index.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the list of NUMA logical indexes
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_obj_by_type(topology, HWLOC_OBJ_NUMANODE, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

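    // Builds the list of core type indexes and the per-core-type CPU affinity masks using the
    // hwloc cpukinds interfaces (available since HWLOC 2.4). On older HWLOC versions, or if
    // parsing fails, a single entry covering the whole process affinity mask is used instead.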
    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

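    // Fills input_mask with the intersection of the process affinity mask, the requested NUMA
    // node mask, and the requested core type mask (an index of -1 means "no constraint").
    // When max_threads_per_core is positive, at most that many hardware threads are kept
    // from each core.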
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Trim the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

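    // Expands current_mask to the full cpusets of all cores it intersects and clips the result
    // to constraints_mask. Used on Windows machines with several processor groups when only the
    // threads-per-core limit is constrained (see binding_handler::apply_affinity).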
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

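    // Returns the number of hardware threads that satisfy the given constraints.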
    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

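    // Saves the calling thread's current CPU binding into current_mask, clipped to the
    // process affinity mask.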
    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

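    // Binds the calling thread to the given mask; empty masks are ignored.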
    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

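// Per-arena affinity helper: stores the constraints affinity mask computed from the arena
// constraints and a backup of each slot's original affinity mask so it can be restored
// on scheduler exit.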
class binding_handler {
    // The following vector saves each thread's affinity mask on scheduler entry so it can be
    // restored for that thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

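    // Saves the calling thread's current affinity into the backup for slot_num, then binds
    // the thread to the handler's constraints affinity mask.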
    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between processor groups. So if a NUMA node or core type constraint is specified,
        // the constraints affinity mask will not cross a processor group border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need to use a special
        // function, which regulates the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    }

};

extern "C" { // exported to TBB interfaces

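// Parses the machine topology (only on the first call) and returns the NUMA node and
// core type index lists to the TBB library.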
void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create a binding handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper