xref: /oneTBB/src/tbb/misc_ex.cpp (revision 7cee2251)
1 /*
2     Copyright (c) 2005-2023 Intel Corporation
3 
4     Licensed under the Apache License, Version 2.0 (the "License");
5     you may not use this file except in compliance with the License.
6     You may obtain a copy of the License at
7 
8         http://www.apache.org/licenses/LICENSE-2.0
9 
10     Unless required by applicable law or agreed to in writing, software
11     distributed under the License is distributed on an "AS IS" BASIS,
12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13     See the License for the specific language governing permissions and
14     limitations under the License.
15 */
16 
17 // Source file for miscellaneous entities that are infrequently referenced by
18 // an executing program, and implementation of which requires dynamic linking.
19 
20 #include "misc.h"
21 
22 #if !defined(__TBB_HardwareConcurrency)
23 
24 #include "dynamic_link.h"
25 #include <stdio.h>
26 #include <limits.h>
27 
28 #if _WIN32||_WIN64
29 #include <windows.h>
30 #if __TBB_WIN8UI_SUPPORT
31 #include <thread>
32 #endif
33 #else
34 #include <unistd.h>
35 #if __unix__
36 #if __linux__
37 #include <sys/sysinfo.h>
38 #endif
39 #include <cstring>
40 #include <sched.h>
41 #include <cerrno>
42 #elif __sun
43 #include <sys/sysinfo.h>
44 #elif __FreeBSD__
45 #include <cerrno>
46 #include <cstring>
47 #include <sys/param.h>  // Required by <sys/cpuset.h>
48 #include <sys/cpuset.h>
49 #endif
50 #endif
51 
52 namespace tbb {
53 namespace detail {
54 namespace r1 {
55 
56 #if __TBB_USE_OS_AFFINITY_SYSCALL
57 
58 #if __unix__
// Handlers for interoperation with libiomp
// Filled in by dynamic_link() during initialization; when resolved it points
// at libiomp5's routine for restoring the thread affinity mask captured by
// libiomp at its startup.
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
65 #endif
66 
67 static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
68 #if __FreeBSD__ || __NetBSD__ || __OpenBSD__
69     if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
70 #else /* __unix__ */
71     if( sched_setaffinity( 0, maskSize, threadMask ) )
72 #endif
73         // Here and below the error severity is lowered from critical level
74         // because it may happen during TBB library unload because of not
75         // waiting for workers to complete (current RML policy, to be fixed).
76         // handle_perror( errno, "setaffinity syscall" );
77         runtime_warning( "setaffinity syscall failed" );
78 }
79 
80 static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
81 #if __FreeBSD__ || __NetBSD__ || __OpenBSD__
82     if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
83 #else /* __unix__ */
84     if( sched_getaffinity( 0, maskSize, threadMask ) )
85 #endif
86     runtime_warning( "getaffinity syscall failed" );
87 }
88 
89 static basic_mask_t* process_mask;
90 static int num_masks;
91 
92 void destroy_process_mask() {
93     delete [] process_mask;
94     process_mask = nullptr;
95 }
96 
// Size in bytes of the currently used affinity mask array.
// NOTE(review): deliberately left as a legacy unparenthesized macro; every
// current use is a complete function argument, where precedence cannot bite.
#define curMaskSize sizeof(basic_mask_t) * num_masks
//! Restores the captured thread affinity mask (unless dismissed) and frees it.
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
//! Captures the current thread affinity mask so the destructor can restore it.
/** If restore_process_mask is true and the thread mask differs from the
    stored process-wide mask, the thread mask is immediately reset to the
    process mask; otherwise the caller is assumed to change the mask itself. **/
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            // memcmp's nonzero-means-different result doubles as the flag
            // that tells the destructor a restore is needed.
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
//! Forgets the captured mask: the destructor will neither restore nor keep it.
void affinity_helper::dismiss() {
    delete [] threadMask;
    threadMask = nullptr;
    is_changed = 0;
}
#undef curMaskSize
128 
static std::atomic<do_once_state> hardware_concurrency_info;

// Number of processors available to this process; computed exactly once by
// initialize_hardware_concurrency_info() and read-only afterwards.
static int theNumProcs;

// Discovers the process affinity mask and counts the processors in it.
// On success the mask is kept in the global process_mask (released later by
// destroy_process_mask()); on failure the whole machine is assumed usable.
static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize =  sizeof(basic_mask_t);
    // The kernel's mask size is not known up front: retry with a doubled
    // buffer while the syscall reports "buffer too small", up to a sanity
    // cap on the total number of mask bits.
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
             break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        // Count set bits in the captured mask, but never report more than
        // the number of processors currently online.
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr);
}
196 
// Returns the number of processors available to the current process.
// The expensive discovery runs exactly once; later calls just read the result.
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}
201 
202 /* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
203 #elif __ANDROID__
204 
// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
// Format of "present" file is: ([<int>-<int>|<int>],)+
// Returns at least 1 even if the file is missing or unparsable.
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == nullptr) return 1;
    int num_args, lower, upper, num_cpus = 0;
    // %d matches the int* arguments (the previous %u/int* pairing was a
    // formally undefined format/argument mismatch); values here are
    // non-negative CPU indices anyway.
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
            default:
                // Nothing matched and the "," probe below would not advance
                // either: consume one character to guarantee forward
                // progress on malformed input (otherwise this loop could
                // spin forever).
                if (fgetc(fp) == EOF) num_args = EOF;
                break;
        }
        if (num_args == EOF) break;
        fscanf(fp, ",");  // consume the list separator, if present
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
221 
222 #elif defined(_SC_NPROCESSORS_ONLN)
223 
// Returns the number of processors currently online, falling back to 1 when
// sysconf reports an error or a nonsensical value.
int AvailableHwConcurrency() {
    const long online = sysconf(_SC_NPROCESSORS_ONLN);
    return online > 0 ? static_cast<int>(online) : 1;
}
228 
229 #elif _WIN32||_WIN64
230 
static std::atomic<do_once_state> hardware_concurrency_info;

// Pseudo-group index accepted by GetActiveProcessorCount meaning "all groups".
static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports maximum 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme remapping workers depending on the pending
                external threads affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
// Array needed for FindProcessorGroupIndex to calculate the target processor
// group when the number of threads exceeds the number of cores, so that
// threads are distributed evenly between processor groups.
int calculate_numa[MaxProcessorGroups];
// Last entry of calculate_numa after it is turned into a running total;
// FindProcessorGroupIndex wraps oversubscribed indices modulo (numaSum+1).
int numaSum;
// Local mirror of the Win32 GROUP_AFFINITY structure, presumably declared
// here so the file builds against SDKs lacking it — TODO confirm.
struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

// Processor-groups API entry points resolved at run time from Kernel32.dll;
// they stay nullptr when the OS (pre-Windows 7) does not provide them.
static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

// Table for mapping to Kernel32.dll entry points.
static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};
281 
// Discovers the processor-group topology and the number of processors
// admitted by the process affinity mask; fills theProcessorGroups,
// calculate_numa and numaSum. Runs exactly once via atomic_do_once.
static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // For these applications processor groups info is unavailable
    // Setting up a number of processors for one processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    // Count the processors admitted by the process affinity mask.
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr);
    // By default setting up a number of processors for one processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr);
        // Fail safety bootstrap. Release versions will limit available concurrency
        // level, while debug ones would assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            // The group the current (first external) thread runs in becomes
            // the "hole" group where a slot is reserved for it.
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            int min_procs = INT_MAX;
            // First pass: per-group size, full-group mask, running total,
            // and the size of the smallest group.
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                if (pgi.numProcs < min_procs) min_procs = pgi.numProcs;  //Finding the minimum number of processors in the Processor Groups
                calculate_numa[i] = pgi.numProcs;
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr);
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr);

            // Second pass: convert calculate_numa into a running total of
            // group sizes scaled down by the smallest group; used by
            // FindProcessorGroupIndex to spread oversubscribed threads.
            calculate_numa[0] = (calculate_numa[0] / min_procs)-1;
            for (WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i) {
                calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs);
            }

            numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1];

        }

    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}
346 
// Returns the number of processor groups detected at initialization.
// Must be called only after AvailableHwConcurrency() has run the discovery.
int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}
351 
// Maps a thread/processor index onto a processor group index. Distribution
// starts from the group holding the first external thread (HoleIndex) and
// wraps round-robin across groups.
int FindProcessorGroupIndex ( int procIdx ) {
    int current_grp_idx = ProcessorGroupInfo::HoleIndex;
    if (procIdx >= theProcessorGroups[current_grp_idx].numProcs  && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) {
        // Index fits within the machine's total processor count: walk the
        // groups, subtracting each group's size until the remainder lands
        // inside one of them.
        procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs;
        do {
            current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups);
            procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs;

        } while (procIdx >= 0);
    }
    else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) {
        // Oversubscription: more threads than processors. Use the
        // precomputed calculate_numa running totals to spread the surplus
        // threads across groups proportionally to each group's size.
        int temp_grp_index = 0;
        procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
        procIdx = procIdx % (numaSum+1);  //ProcIdx to stay between 0 and numaSum

        while (procIdx - calculate_numa[temp_grp_index] > 0) {
            temp_grp_index = (temp_grp_index + 1) % ProcessorGroupInfo::NumGroups;
        }
        current_grp_idx = temp_grp_index;
    }
    __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr);

    return current_grp_idx;
}
376 
377 void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
378     __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
379     if ( !TBB_SetThreadGroupAffinity )
380         return;
381     TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
382     TBB_SetThreadGroupAffinity( hThread, &ga, nullptr);
383 }
384 
// Returns the total number of processors across all processor groups.
// The last group's running total already accumulates all preceding groups.
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}
389 
390 /* End of _WIN32||_WIN64 implementation */
391 #else
392     #error AvailableHwConcurrency is not implemented for this OS
393 #endif
394 
395 } // namespace r1
396 } // namespace detail
397 } // namespace tbb
398 
399 #endif /* !__TBB_HardwareConcurrency */
400