xref: /oneTBB/src/tbb/misc_ex.cpp (revision 8c9445de)
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/
// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, as workers
        // are not waited for at unload time (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;
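// The process affinity mask captured at initialization time. It is stored as
// an array of num_masks basic_mask_t chunks because the kernel mask may cover
// more CPUs than a single basic_mask_t can represent.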

void destroy_process_mask() {
    delete [] process_mask;
    process_mask = nullptr;
}

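// Note: curMaskSize is a macro rather than a constant because it reads the
// file-scope num_masks, which is set only after the mask size has been
// discovered in initialize_hardware_concurrency_info().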
#define curMaskSize sizeof(basic_mask_t) * num_masks
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    delete [] threadMask;
    threadMask = NULL;
    is_changed = 0;
}
#undef curMaskSize
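// A minimal usage sketch (illustrative, not part of this file): code that must
// temporarily run under the process-wide mask would do
//
//     affinity_helper helper;
//     helper.protect_affinity_mask( /*restore_process_mask=*/true );
//     ... // run with the process mask reinstated
//     // ~affinity_helper() restores the captured thread mask if it changed
//
// Calling dismiss() instead keeps the thread mask current at destruction time.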

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
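    // The required mask size is not reported directly by the kernel, so probe
    // for it: keep doubling numMasks while the syscall fails with ERANGE (BSD)
    // or EINVAL (Linux), up to a hard cap on the number of mask bits.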
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail-safe strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android: read the number of available CPUs from sysfs,
// since the affinity system calls are unreliable there.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
// e.g. "0-3,5" describes five CPUs: 0, 1, 2, 3, and 5.
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break; // a "lower-upper" range
            case 1: num_cpus += 1; break;                 // a single CPU index
        }
        fscanf(fp, ",");    // skip a separator, if present
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;
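// 0xffff is the documented ALL_PROCESSOR_GROUPS value that
// GetActiveProcessorCount() accepts for querying all groups at once.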

// Statically allocate an array for processor group information.
// Windows 7 supports a maximum of 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor group support, the current
        implementation defines "the first external thread" as the first thread
        to invoke AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme remapping workers depending on the pending
                external threads affinity. **/
    static int HoleIndex;
};
int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

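// Mirrors the Win32 GROUP_AFFINITY structure; presumably declared locally so
// that the file builds even with SDK headers predating processor-group support.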
struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // Processor group information is unavailable to Windows 8 UI (store) applications;
    // set up the number of processors for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default, set up the number of processors for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups when the process does not restrict its affinity mask
    // and more than one processor group may be present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process has no restricting affinity mask, so multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safety bootstrap: release versions limit the available concurrency
        // level, while debug ones assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))
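// For groups at or after holeIdx, one slot is already taken by the external
// thread, so a worker's effective index within the running totals is shifted
// up by one; before holeIdx the index is used as is.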

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round-robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
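// Worked example (illustrative): with two groups of 4 processors each and the
// external thread in group 0 (HoleIndex == 0), workers 0..2 fill the remaining
// slots of group 0, workers 3..6 land in group 1, and worker 7 wraps around to
// group 0 again in a round-robin fashion.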

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */