/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

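// Feature-test macros: the hwloc CPU kinds (hybrid CPUs) interface appeared in hwloc 2.4,
// and the HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING topology flag in hwloc 2.5.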
#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes that are returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the hardware of the machine that TBB happens to run on
//------------------------------------------------------------------------
class platform_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

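    // Stages of topology initialization; the destructor uses the stage that was reached
    // to release only the resources that were actually acquired.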
    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that belong to other Windows processor groups is allowed only if
    // the machine topology contains several Windows processor groups and the process
    // affinity mask was not limited manually (an affinity mask cannot violate
    // processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Get the process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

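    // Build the list of NUMA node logical indexes together with per-node CPU affinity masks
    // restricted to the process affinity mask.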
    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so we use this check
        // to switch to the fallback topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count is zero if the process affinity mask is empty as well (an invalid case)
            // and negative if some internal HWLOC error occurred, so we store -1 as the index
            // in the former case and 0 otherwise.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get the NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the affinity masks list
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

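    // Build the list of core type indexes and affinity masks via the hwloc CPU kinds interface;
    // fall back to a single stub entry when the interface is unavailable or reports an error.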
    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parse the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    static platform_topology& instance() {
        static platform_topology topology;
        return topology;
    }

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    ~platform_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

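    // Compose an affinity mask that satisfies the given NUMA node, core type, and
    // threads-per-core constraints; -1 means "no constraint" for the corresponding parameter.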
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // Clear the input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Trim the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

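    // Widen current_mask to the whole cores it touches and then restrict the result
    // to constraints_mask (used on Windows when a mask may span several processor groups).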
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

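    // The default concurrency for a constraints set is the number of hardware threads
    // left in the corresponding constraints mask.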
    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized platform_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

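    // Save the calling thread's current CPU binding into current_mask, clipped to the process affinity mask.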
    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

class binding_handler {
    // The following vector saves each thread's affinity mask on scheduler entry so that
    // it can be restored for that thread on scheduler exit.
    typedef std::vector<platform_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    platform_topology::affinity_mask handler_affinity_mask;

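    // On Windows, extra state is kept to handle affinity masks that may span several processor groups.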
#if WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#if WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = platform_topology::instance().allocate_process_affinity_mask();
#if WIN32
            affinity_buffer[i] = platform_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = platform_topology::instance().allocate_process_affinity_mask();
        platform_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            platform_topology::instance().free_affinity_mask(affinity_backup[i]);
#if WIN32
            platform_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        platform_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

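    // Save the thread's current affinity into its backup slot and bind it to the handler's constraints mask.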
    void apply_affinity( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#if WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor groups' border.

        // But if the constraint is based only on the max_threads_per_core setting, then the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why we need to use the special
        // function, which regulates the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = platform_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized platform_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    }

};

extern "C" { // exported to TBB interfaces

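// Entry points exported to the TBB library. An illustrative (not authoritative) call sequence
// from the caller's side might look like:
//   __TBB_internal_initialize_system_topology(...);                      // once, to parse the topology
//   binding_handler* h = __TBB_internal_allocate_binding_handler(...);   // per constraints set
//   __TBB_internal_apply_affinity(h, slot);                              // on arena slot entry
//   __TBB_internal_restore_affinity(h, slot);                            // on arena slot exit
//   __TBB_internal_deallocate_binding_handler(h);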
void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    platform_topology::instance().initialize(groups_num);
    platform_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create a NUMA handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return platform_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper