xref: /oneTBB/src/tbbbind/tbb_bind.cpp (revision 71e1bb8e)
/*
    Copyright (c) 2019-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"
#include "oneapi/tbb/detail/_config.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)
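// Note: HWLOC_API_VERSION encodes the hwloc version as a hexadecimal major/minor/patch triple,
// so the thresholds 0x20400 and 0x20500 above correspond to hwloc 2.4 and 2.5 respectively.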

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
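// For illustration, a wrapped call such as
//     assertion_hwloc_wrapper(hwloc_get_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
// asserts that hwloc_get_cpubind(...) returned a non-negative value; the _EX assertion flavor is
// presumably used so that the wrapped call is still evaluated when assertions are compiled out.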

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to run on
//------------------------------------------------------------------------
class system_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that belong to other Windows processor groups is allowed only if the
    // machine topology contains several Windows processor groups and the process affinity mask
    // was not limited manually (an affinity mask cannot cross processor group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

#if __TBB_CPUBIND_PRESENT
        // Getting process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }
#else
        process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
        process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
#endif

        number_of_processors_groups = groups_num;
    }

    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to the fallback way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty (an invalid case)
            // or negative if some internal HWLOC error occurred.
            // A stub index is stored in these cases.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill concurrency and affinity masks lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }
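
    // Illustrative example (assuming a hypothetical machine with two NUMA nodes and an
    // unrestricted process affinity mask): the first pass above fills numa_indexes_list with the
    // logical indexes {0, 1}, and the second pass stores each node's cpuset, intersected with the
    // process CPU affinity mask, at the corresponding position of numa_affinity_masks_list.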

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parsing the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }
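
    // Illustrative example (hybrid CPU assumption): on a processor with two core types, such as
    // efficiency and performance cores, hwloc_cpukinds_get_nr() would return 2, and the loop above
    // would record one affinity mask per kind (restricted to the process affinity mask) together
    // with its index in core_types_indexes_list; hwloc is expected to report the kinds ordered by
    // increasing efficiency.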

    void enforce_hwloc_2_5_runtime_linkage() {
        // Without this call, HWLOC 2.4 could be loaded successfully while tbbbind_2_5 is being loaded,
        // because tbbbind_2_5 does not otherwise use any entry points introduced in HWLOC 2.5.
        // However, tbbbind_2_5 is compiled against the HWLOC 2.5 headers, so that situation would require
        // binary forward compatibility, which the HWLOC library does not guarantee. To enforce that
        // tbbbind_2_5 links only against HWLOC >= 2.5, this function calls an interface available only since HWLOC 2.5.
#if HWLOC_API_VERSION >= 0x20500
        auto some_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, nullptr);
        hwloc_get_obj_with_same_locality(topology, some_core, HWLOC_OBJ_CORE, nullptr, nullptr, 0);
#endif
    }


    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        enforce_hwloc_2_5_runtime_linkage();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    static system_topology* instance_ptr;
public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    static void construct( std::size_t groups_num ) {
        if (instance_ptr == nullptr) {
            instance_ptr = new system_topology();
            instance_ptr->initialize(groups_num);
        }
    }

    static system_topology& instance() {
        __TBB_ASSERT(instance_ptr != nullptr, "Getting instance of non-constructed topology");
        return *instance_ptr;
    }

    static void destroy() {
        __TBB_ASSERT(instance_ptr != nullptr, "Destroying non-constructed topology");
        delete instance_ptr;
    }
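
    // Typical lifecycle, as driven by the exported __TBB_internal_* entry points at the bottom of
    // this file: construct(groups_num) builds and parses the singleton once, instance() gives
    // access to the parsed topology, and destroy() releases it.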

    ~system_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // clear input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // limit the core mask to the required number of threads per core
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }
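
    // Illustrative example: a call such as
    //     fill_constraints_affinity_mask(mask, /*numa_node_index*/ -1, /*core_type_index*/ -1,
    //                                    /*max_threads_per_core*/ 1);
    // starts from the whole process CPU affinity mask (the -1 values mean "no NUMA node / core type
    // restriction") and leaves at most one hardware thread per core set in `mask`.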

    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }
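
    // In effect, the function above keeps every core that current_mask already touches, widens the
    // selection to those cores' full cpusets, and then intersects the result with constraints_mask,
    // so the per-core thread limit already encoded in constraints_mask is preserved.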

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }
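
    // For instance, get_default_concurrency(-1, -1, -1) returns the number of hardware threads
    // available to the process (the weight of process_cpu_affinity_mask), since no constraint
    // narrows the mask in that case.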

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

system_topology* system_topology::instance_ptr{nullptr};

class binding_handler {
    // The following vector saves each thread's affinity mask on scheduler entry so that it can be
    // restored for that thread on scheduler exit.
    typedef std::vector<system_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    system_topology::affinity_mask handler_affinity_mask;

#ifdef _WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#ifdef _WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = system_topology::instance().allocate_process_affinity_mask();
#ifdef _WIN32
            affinity_buffer[i] = system_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = system_topology::instance().allocate_process_affinity_mask();
        system_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            system_topology::instance().free_affinity_mask(affinity_backup[i]);
#ifdef _WIN32
            system_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        system_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized system_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#ifdef _WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between processor groups. So if a specific NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross the processor group border.

        // But if the constraint is based only on the max_threads_per_core setting, the
        // constraints affinity mask may cross the border between several processor groups
        // on machines with more than 64 hardware threads. That is why the special function
        // that limits the number of threads per core within the current thread's mask is needed.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized system_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};
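
// Intended usage (mirrored by the exported functions below): a binding_handler is allocated with
// one entry per arena slot; apply_affinity(slot_num) saves the calling thread's current affinity
// mask and binds the thread to the handler's constraints mask, while
// restore_previous_affinity_mask(slot_num) re-applies the saved mask on scheduler exit.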

extern "C" { // exported to TBB interfaces

TBBBIND_EXPORT void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    system_topology::construct(groups_num);
    system_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}
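
// A minimal caller sketch (hypothetical, for illustration only; in practice the oneTBB runtime is
// expected to resolve and call these entry points dynamically):
//     int numa_count = 0, core_types_count = 0;
//     int *numa_indexes = nullptr, *core_type_indexes = nullptr;
//     __TBB_internal_initialize_system_topology(/*groups_num*/ 1, numa_count, numa_indexes,
//                                               core_types_count, core_type_indexes);
//     // numa_indexes now points to numa_count logical NUMA indexes owned by the topology singleton.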

TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_destroy_system_topology() {
    return system_topology::destroy();
}

} // extern "C"

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper
539