/*
    Copyright (c) 2019-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#include <vector>
#include <mutex>

#include "../tbb/assert_impl.h" // Out-of-line TBB assertion handling routines are instantiated here.
#include "oneapi/tbb/detail/_assert.h"
#include "oneapi/tbb/detail/_config.h"

#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( push )
#pragma warning( disable : 4100 )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif
#include <hwloc.h>
#if _MSC_VER && !__INTEL_COMPILER && !__clang__
#pragma warning( pop )
#elif _MSC_VER && __clang__
#pragma GCC diagnostic pop
#endif

#define __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT (HWLOC_API_VERSION >= 0x20400)
#define __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT (HWLOC_API_VERSION >= 0x20500)

// Most hwloc calls return a negative exit code on error.
// This macro checks the error codes returned from the hwloc interfaces.
#define assertion_hwloc_wrapper(command, ...) \
        __TBB_ASSERT_EX( (command(__VA_ARGS__)) >= 0, "Error occurred during call to hwloc API.");
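// For example, assertion_hwloc_wrapper(hwloc_get_cpubind, topology, mask, HWLOC_CPUBIND_THREAD)
// expands to __TBB_ASSERT_EX( (hwloc_get_cpubind(topology, mask, HWLOC_CPUBIND_THREAD)) >= 0, ... ),
// so the wrapped hwloc call is still evaluated in builds where the assertion itself is disabled.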

namespace tbb {
namespace detail {
namespace r1 {

//------------------------------------------------------------------------
// Information about the machine's hardware that TBB happens to work on
//------------------------------------------------------------------------
class system_topology {
    friend class binding_handler;

    // Common topology members
    hwloc_topology_t topology{nullptr};
    hwloc_cpuset_t   process_cpu_affinity_mask{nullptr};
    hwloc_nodeset_t  process_node_affinity_mask{nullptr};
    std::size_t number_of_processors_groups{1};

    // NUMA API related topology members
    std::vector<hwloc_cpuset_t> numa_affinity_masks_list{};
    std::vector<int> numa_indexes_list{};
    int numa_nodes_count{0};

    // Hybrid CPUs API related topology members
    std::vector<hwloc_cpuset_t> core_types_affinity_masks_list{};
    std::vector<int> core_types_indexes_list{};

    enum init_stages { uninitialized,
                       started,
                       topology_allocated,
                       topology_loaded,
                       topology_parsed } initialization_state;

    // Binding threads that reside in other Windows processor groups is allowed only if
    // the machine topology contains several Windows processor groups and the process
    // affinity mask was not limited manually (an affinity mask cannot violate processor
    // group boundaries).
    bool intergroup_binding_allowed(std::size_t groups_num) { return groups_num > 1; }

private:
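    // Allocates and loads the hwloc topology, then captures the process CPU and NUMA node
    // affinity masks: either the complete machine masks (when intergroup binding is allowed)
    // or the masks derived from the current process binding.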
    void topology_initialization(std::size_t groups_num) {
        initialization_state = started;

        // Parse topology
        if ( hwloc_topology_init( &topology ) == 0 ) {
            initialization_state = topology_allocated;
#if __TBBBIND_HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING_PRESENT
            if ( groups_num == 1 &&
                 hwloc_topology_set_flags(topology,
                     HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
                     HWLOC_TOPOLOGY_FLAG_RESTRICT_TO_CPUBINDING
                 ) != 0
            ) {
                return;
            }
#endif
            if ( hwloc_topology_load( topology ) == 0 ) {
                initialization_state = topology_loaded;
            }
        }
        if ( initialization_state != topology_loaded )
            return;

        // Getting process affinity mask
        if ( intergroup_binding_allowed(groups_num) ) {
            process_cpu_affinity_mask  = hwloc_bitmap_dup(hwloc_topology_get_complete_cpuset (topology));
            process_node_affinity_mask = hwloc_bitmap_dup(hwloc_topology_get_complete_nodeset(topology));
        } else {
            process_cpu_affinity_mask  = hwloc_bitmap_alloc();
            process_node_affinity_mask = hwloc_bitmap_alloc();

            assertion_hwloc_wrapper(hwloc_get_cpubind, topology, process_cpu_affinity_mask, 0);
            hwloc_cpuset_to_nodeset(topology, process_cpu_affinity_mask, process_node_affinity_mask);
        }

        number_of_processors_groups = groups_num;
    }

    void numa_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            numa_nodes_count = 1;
            numa_indexes_list.push_back(-1);
            return;
        }

        // If the system contains no NUMA nodes, HWLOC 1.11 returns an infinitely filled bitmap.
        // hwloc_bitmap_weight() returns a negative value for such bitmaps, so this check is used
        // to switch to a different way of topology initialization.
        numa_nodes_count = hwloc_bitmap_weight(process_node_affinity_mask);
        if (numa_nodes_count <= 0) {
            // numa_nodes_count may be zero if the process affinity mask is empty too (an invalid case)
            // or negative if some internal HWLOC error occurred.
            // So we place -1 as the index in this case.
            numa_indexes_list.push_back(numa_nodes_count == 0 ? -1 : 0);
            numa_nodes_count = 1;

            numa_affinity_masks_list.push_back(hwloc_bitmap_dup(process_cpu_affinity_mask));
        } else {
            // Get NUMA logical indexes list
            unsigned counter = 0;
            int i = 0;
            int max_numa_index = -1;
            numa_indexes_list.resize(numa_nodes_count);
            hwloc_obj_t node_buffer;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                numa_indexes_list[counter] = static_cast<int>(node_buffer->logical_index);

                if ( numa_indexes_list[counter] > max_numa_index ) {
                    max_numa_index = numa_indexes_list[counter];
                }

                counter++;
            } hwloc_bitmap_foreach_end();
            __TBB_ASSERT(max_numa_index >= 0, "Maximal NUMA index must not be negative");

            // Fill the concurrency and affinity mask lists
            numa_affinity_masks_list.resize(max_numa_index + 1);
            int index = 0;
            hwloc_bitmap_foreach_begin(i, process_node_affinity_mask) {
                node_buffer = hwloc_get_numanode_obj_by_os_index(topology, i);
                index = static_cast<int>(node_buffer->logical_index);

                hwloc_cpuset_t& current_mask = numa_affinity_masks_list[index];
                current_mask = hwloc_bitmap_dup(node_buffer->cpuset);

                hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
                __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask), "hwloc detected unavailable NUMA node");
            } hwloc_bitmap_foreach_end();
        }
    }

    void core_types_topology_parsing() {
        // Fill parameters with stubs if topology parsing is broken.
        if ( initialization_state != topology_loaded ) {
            core_types_indexes_list.push_back(-1);
            return;
        }
#if __TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT
        __TBB_ASSERT(hwloc_get_api_version() >= 0x20400, "Hybrid CPUs support interfaces require HWLOC >= 2.4");
        // Parsing the hybrid CPU topology
        int core_types_number = hwloc_cpukinds_get_nr(topology, 0);
        bool core_types_parsing_broken = core_types_number <= 0;
        if (!core_types_parsing_broken) {
            core_types_affinity_masks_list.resize(core_types_number);
            int efficiency{-1};

            for (int core_type = 0; core_type < core_types_number; ++core_type) {
                hwloc_cpuset_t& current_mask = core_types_affinity_masks_list[core_type];
                current_mask = hwloc_bitmap_alloc();

                if (!hwloc_cpukinds_get_info(topology, core_type, current_mask, &efficiency, nullptr, nullptr, 0)
                    && efficiency >= 0
                ) {
                    hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);

                    if (hwloc_bitmap_weight(current_mask) > 0) {
                        core_types_indexes_list.push_back(core_type);
                    }
                    __TBB_ASSERT(hwloc_bitmap_weight(current_mask) >= 0, "Infinitely filled core type mask");
                } else {
                    core_types_parsing_broken = true;
                    break;
                }
            }
        }
#else /*!__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/
        bool core_types_parsing_broken{true};
#endif /*__TBBBIND_HWLOC_HYBRID_CPUS_INTERFACES_PRESENT*/

        if (core_types_parsing_broken) {
            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }
            core_types_affinity_masks_list.resize(1);
            core_types_indexes_list.resize(1);

            core_types_affinity_masks_list[0] = hwloc_bitmap_dup(process_cpu_affinity_mask);
            core_types_indexes_list[0] = -1;
        }
    }

    void enforce_hwloc_2_5_runtime_linkage() {
        // Without a call to this function, HWLOC 2.4 could be successfully loaded while tbbbind_2_5 is
        // being loaded. This is possible because tbbbind_2_5 does not use any new entry points introduced
        // in HWLOC 2.5. But tbbbind_2_5 is compiled against the HWLOC 2.5 headers, so such a situation
        // would require binary forward compatibility, which the HWLOC library does not guarantee. To enforce
        // linking tbbbind_2_5 only against HWLOC >= 2.5, this function calls an interface that is available
        // only in HWLOC 2.5 and later.
#if HWLOC_API_VERSION >= 0x20500
        auto some_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, nullptr);
        hwloc_get_obj_with_same_locality(topology, some_core, HWLOC_OBJ_CORE, nullptr, nullptr, 0);
#endif
    }


    void initialize( std::size_t groups_num ) {
        if ( initialization_state != uninitialized )
            return;

        topology_initialization(groups_num);
        numa_topology_parsing();
        core_types_topology_parsing();

        enforce_hwloc_2_5_runtime_linkage();

        if (initialization_state == topology_loaded)
            initialization_state = topology_parsed;
    }

    static system_topology* instance_ptr;
public:
    typedef hwloc_cpuset_t             affinity_mask;
    typedef hwloc_const_cpuset_t const_affinity_mask;

    bool is_topology_parsed() { return initialization_state == topology_parsed; }

    static void construct( std::size_t groups_num ) {
        if (instance_ptr == nullptr) {
            instance_ptr = new system_topology();
            instance_ptr->initialize(groups_num);
        }
    }

    static system_topology& instance() {
        __TBB_ASSERT(instance_ptr != nullptr, "Getting instance of non-constructed topology");
        return *instance_ptr;
    }

    static void destroy() {
        __TBB_ASSERT(instance_ptr != nullptr, "Destroying non-constructed topology");
        delete instance_ptr;
    }

    ~system_topology() {
        if ( is_topology_parsed() ) {
            for (auto& numa_node_mask : numa_affinity_masks_list) {
                hwloc_bitmap_free(numa_node_mask);
            }

            for (auto& core_type_mask : core_types_affinity_masks_list) {
                hwloc_bitmap_free(core_type_mask);
            }

            hwloc_bitmap_free(process_node_affinity_mask);
            hwloc_bitmap_free(process_cpu_affinity_mask);
        }

        if ( initialization_state >= topology_allocated ) {
            hwloc_topology_destroy(topology);
        }

        initialization_state = uninitialized;
    }

    void fill_topology_information(
        int& _numa_nodes_count, int*& _numa_indexes_list,
        int& _core_types_count, int*& _core_types_indexes_list
    ) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        _numa_nodes_count = numa_nodes_count;
        _numa_indexes_list = numa_indexes_list.data();

        _core_types_count = (int)core_types_indexes_list.size();
        _core_types_indexes_list = core_types_indexes_list.data();
    }

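    // Composes into input_mask the intersection of the process affinity mask with the requested
    // NUMA node and core type masks (negative indexes mean "no constraint"). If max_threads_per_core
    // is positive, at most that many hardware threads are kept from each core.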
    void fill_constraints_affinity_mask(affinity_mask input_mask, int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        __TBB_ASSERT(numa_node_index < (int)numa_affinity_masks_list.size(), "Wrong NUMA node id");
        __TBB_ASSERT(core_type_index < (int)core_types_affinity_masks_list.size(), "Wrong core type id");
        __TBB_ASSERT(max_threads_per_core == -1 || max_threads_per_core > 0, "Wrong max_threads_per_core");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        hwloc_cpuset_t core_mask = hwloc_bitmap_alloc();

        hwloc_bitmap_copy(constraints_mask, process_cpu_affinity_mask);
        if (numa_node_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, numa_affinity_masks_list[numa_node_index]);
        }
        if (core_type_index >= 0) {
            hwloc_bitmap_and(constraints_mask, constraints_mask, core_types_affinity_masks_list[core_type_index]);
        }
        if (max_threads_per_core > 0) {
            // clear input mask
            hwloc_bitmap_zero(input_mask);

            hwloc_obj_t current_core = nullptr;
            while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
                hwloc_bitmap_and(core_mask, constraints_mask, current_core->cpuset);

                // Trim the core mask to the required number of bits
                int current_threads_per_core = 0;
                for (int id = hwloc_bitmap_first(core_mask); id != -1; id = hwloc_bitmap_next(core_mask, id)) {
                    if (++current_threads_per_core > max_threads_per_core) {
                        hwloc_bitmap_clr(core_mask, id);
                    }
                }

                hwloc_bitmap_or(input_mask, input_mask, core_mask);
            }
        } else {
            hwloc_bitmap_copy(input_mask, constraints_mask);
        }

        hwloc_bitmap_free(core_mask);
        hwloc_bitmap_free(constraints_mask);
    }

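    // Rebuilds result_mask from the whole cores that intersect current_mask and then clips it to
    // constraints_mask (see the Windows-specific branch in binding_handler::apply_affinity below).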
    void fit_num_threads_per_core(affinity_mask result_mask, affinity_mask current_mask, affinity_mask constraints_mask) {
        hwloc_bitmap_zero(result_mask);
        hwloc_obj_t current_core = nullptr;
        while ((current_core = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_CORE, current_core)) != nullptr) {
            if (hwloc_bitmap_intersects(current_mask, current_core->cpuset)) {
                hwloc_bitmap_or(result_mask, result_mask, current_core->cpuset);
            }
        }
        hwloc_bitmap_and(result_mask, result_mask, constraints_mask);
    }

    int get_default_concurrency(int numa_node_index, int core_type_index, int max_threads_per_core) {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");

        hwloc_cpuset_t constraints_mask = hwloc_bitmap_alloc();
        fill_constraints_affinity_mask(constraints_mask, numa_node_index, core_type_index, max_threads_per_core);

        int default_concurrency = hwloc_bitmap_weight(constraints_mask);
        hwloc_bitmap_free(constraints_mask);
        return default_concurrency;
    }

    affinity_mask allocate_process_affinity_mask() {
        __TBB_ASSERT(is_topology_parsed(), "Trying to get access to uninitialized system_topology");
        return hwloc_bitmap_dup(process_cpu_affinity_mask);
    }

    void free_affinity_mask( affinity_mask mask_to_free ) {
        hwloc_bitmap_free(mask_to_free); // If bitmap is nullptr, no operation is performed.
    }

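    // Captures the calling thread's current CPU binding into current_mask and restricts it to the
    // process affinity mask.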
    void store_current_affinity_mask( affinity_mask current_mask ) {
        assertion_hwloc_wrapper(hwloc_get_cpubind, topology, current_mask, HWLOC_CPUBIND_THREAD);

        hwloc_bitmap_and(current_mask, current_mask, process_cpu_affinity_mask);
        __TBB_ASSERT(!hwloc_bitmap_iszero(current_mask),
            "Current affinity mask must intersect with the process affinity mask");
    }

    void set_affinity_mask( const_affinity_mask mask ) {
        if (hwloc_bitmap_weight(mask) > 0) {
            assertion_hwloc_wrapper(hwloc_set_cpubind, topology, mask, HWLOC_CPUBIND_THREAD);
        }
    }
};

system_topology* system_topology::instance_ptr{nullptr};
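// A minimal usage sketch of the singleton above, mirroring the exported C interface at the bottom of
// this file: system_topology::construct(groups_num) parses the topology once, system_topology::instance()
// provides access to it afterwards, and system_topology::destroy() releases it at shutdown.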

class binding_handler {
    // The following vector saves the thread affinity mask on scheduler entry so that it can be
    // restored for this thread on scheduler exit.
    typedef std::vector<system_topology::affinity_mask> affinity_masks_container;
    affinity_masks_container affinity_backup;
    system_topology::affinity_mask handler_affinity_mask;

#ifdef _WIN32
    affinity_masks_container affinity_buffer;
    int my_numa_node_id;
    int my_core_type_id;
    int my_max_threads_per_core;
#endif

public:
    binding_handler( std::size_t size, int numa_node_id, int core_type_id, int max_threads_per_core )
        : affinity_backup(size)
#ifdef _WIN32
        , affinity_buffer(size)
        , my_numa_node_id(numa_node_id)
        , my_core_type_id(core_type_id)
        , my_max_threads_per_core(max_threads_per_core)
#endif
    {
        for (std::size_t i = 0; i < size; ++i) {
            affinity_backup[i] = system_topology::instance().allocate_process_affinity_mask();
#ifdef _WIN32
            affinity_buffer[i] = system_topology::instance().allocate_process_affinity_mask();
#endif
        }
        handler_affinity_mask = system_topology::instance().allocate_process_affinity_mask();
        system_topology::instance().fill_constraints_affinity_mask
            (handler_affinity_mask, numa_node_id, core_type_id, max_threads_per_core);
    }

    ~binding_handler() {
        for (std::size_t i = 0; i < affinity_backup.size(); ++i) {
            system_topology::instance().free_affinity_mask(affinity_backup[i]);
#ifdef _WIN32
            system_topology::instance().free_affinity_mask(affinity_buffer[i]);
#endif
        }
        system_topology::instance().free_affinity_mask(handler_affinity_mask);
    }

    void apply_affinity( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(slot_num < affinity_backup.size(),
            "The slot number is greater than the number of slots in the arena");
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized system_topology");

        topology.store_current_affinity_mask(affinity_backup[slot_num]);

#ifdef _WIN32
        // TBBBind supports only systems where NUMA nodes and core types do not cross the border
        // between several processor groups. So if a certain NUMA node or core type constraint is
        // specified, the constraints affinity mask will not cross a processor group border.

        // But if the constraint is based only on the max_threads_per_core setting, the constraints
        // affinity mask may cross the border between several processor groups on machines with more
        // than 64 hardware threads. That is why we need to use a special function, which regulates
        // the number of threads in the current thread's mask.
        if (topology.number_of_processors_groups > 1 && my_max_threads_per_core != -1 &&
            (my_numa_node_id == -1 || topology.numa_indexes_list.size() == 1) &&
            (my_core_type_id == -1 || topology.core_types_indexes_list.size() == 1)
        ) {
            topology.fit_num_threads_per_core(affinity_buffer[slot_num], affinity_backup[slot_num], handler_affinity_mask);
            topology.set_affinity_mask(affinity_buffer[slot_num]);
            return;
        }
#endif
        topology.set_affinity_mask(handler_affinity_mask);
    }

    void restore_previous_affinity_mask( unsigned slot_num ) {
        auto& topology = system_topology::instance();
        __TBB_ASSERT(topology.is_topology_parsed(),
            "Trying to get access to uninitialized system_topology");
        topology.set_affinity_mask(affinity_backup[slot_num]);
    };

};

extern "C" { // exported to TBB interfaces

TBBBIND_EXPORT void __TBB_internal_initialize_system_topology(
    std::size_t groups_num,
    int& numa_nodes_count, int*& numa_indexes_list,
    int& core_types_count, int*& core_types_indexes_list
) {
    system_topology::construct(groups_num);
    system_topology::instance().fill_topology_information(
        numa_nodes_count, numa_indexes_list,
        core_types_count, core_types_indexes_list
    );
}

TBBBIND_EXPORT binding_handler* __TBB_internal_allocate_binding_handler(int number_of_slots, int numa_id, int core_type_id, int max_threads_per_core) {
    __TBB_ASSERT(number_of_slots > 0, "Trying to create numa handler for 0 threads.");
    return new binding_handler(number_of_slots, numa_id, core_type_id, max_threads_per_core);
}

TBBBIND_EXPORT void __TBB_internal_deallocate_binding_handler(binding_handler* handler_ptr) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to deallocate nullptr pointer.");
    delete handler_ptr;
}

TBBBIND_EXPORT void __TBB_internal_apply_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->apply_affinity(slot_num);
}

TBBBIND_EXPORT void __TBB_internal_restore_affinity(binding_handler* handler_ptr, int slot_num) {
    __TBB_ASSERT(handler_ptr != nullptr, "Trying to get access to uninitialized metadata.");
    handler_ptr->restore_previous_affinity_mask(slot_num);
}

TBBBIND_EXPORT int __TBB_internal_get_default_concurrency(int numa_id, int core_type_id, int max_threads_per_core) {
    return system_topology::instance().get_default_concurrency(numa_id, core_type_id, max_threads_per_core);
}

void __TBB_internal_destroy_system_topology() {
    return system_topology::destroy();
}

} // extern "C"
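
// A rough usage sketch of the exported entry points above, as they are expected to be driven from
// the oneTBB runtime (caller-side variable names here are illustrative only):
//
//     int nodes_count = 0, core_types_count = 0;
//     int* node_indexes = nullptr;
//     int* core_type_indexes = nullptr;
//     __TBB_internal_initialize_system_topology(groups_num, nodes_count, node_indexes,
//                                               core_types_count, core_type_indexes);
//     binding_handler* handler =
//         __TBB_internal_allocate_binding_handler(slots, numa_id, core_type_id, max_threads_per_core);
//     __TBB_internal_apply_affinity(handler, slot);    // on scheduler entry
//     __TBB_internal_restore_affinity(handler, slot);  // on scheduler exit
//     __TBB_internal_deallocate_binding_handler(handler);
//     __TBB_internal_destroy_system_topology();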

} // namespace r1
} // namespace detail
} // namespace tbb

#undef assertion_hwloc_wrapper