/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __linux__
#include <sys/sysinfo.h>
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __linux__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
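// When libiomp5.so is already loaded, dynamic_link() resolves the entry point
// named above and stores its address in libiomp_try_restoring_original_mask
// (see initialize_hardware_concurrency_info below).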
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __linux__
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, when workers
        // have not been waited for (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __linux__
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

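// Convenience macro: size in bytes of the current mask array (num_masks elements).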
#define curMaskSize sizeof(basic_mask_t) * num_masks
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize
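// Typical usage of affinity_helper (a sketch): the destructor restores the saved
// mask via RAII unless dismiss() was called first.
//
//     affinity_helper affhelp;
//     affhelp.protect_affinity_mask( /*restore_process_mask=*/true );
//     ... run code that may rebind the current thread ...
//     // ~affinity_helper() reapplies the saved thread mask here.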

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
#if __linux__
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#else /* FreeBSD >= 7.1 */
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
#endif
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
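    // The kernel's affinity mask can be wider than a single basic_mask_t on
    // machines with many CPUs. Start with one mask element and double the array
    // size until the syscall accepts the buffer (or a sanity limit is reached).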
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __linux__
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#else /* FreeBSD >= 7.1 */
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#endif /* FreeBSD >= 7.1 */
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // set here because affinity_helper relies on it
#if __linux__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail-safe fallback
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Workaround for Android: read the number of available CPUs from sysfs,
// since the system calls are unreliable there.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
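// For example, "0-3" denotes 4 CPUs, and "0,2-4" denotes CPUs 0, 2, 3, and 4.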
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports at most 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor group support, the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme remapping workers depending on the pending
                external threads affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};
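// TBB_GROUP_AFFINITY mirrors the layout of the Win32 GROUP_AFFINITY structure;
// declaring it locally lets the file build against SDKs that predate Windows 7.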

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};
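// The processor group APIs exist only on Windows 7 / Windows Server 2008 R2 and
// later, so they are resolved dynamically from Kernel32.dll; on older systems the
// pointers stay NULL and the single-group code path below is used.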

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // Processor group info is unavailable to these applications;
    // set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default assume a single processor group with nproc processors.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups when the process affinity mask is unrestricted
    // and more than one processor group may be present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process has no restricting affinity mask, and multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safe bootstrap: release builds limit the available concurrency
        // level, while debug builds assert (above).
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
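                // Build a mask with the low numProcs bits set; the shift below would be
                // undefined for a full group of sizeof(DWORD_PTR) * CHAR_BIT processors,
                // hence the special case.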
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))
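// The term (holeIdx <= grpIdx) evaluates to 1 when the reserved slot lies in this
// group or an earlier one, shifting the worker index up by one to skip that slot.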

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round-robin manner.
    // Only numProcs - 1 worker indices fit before wrapping, because one slot is
    // reserved for the first external thread.
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
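// Worked example (a sketch, assuming two groups of 4 processors and HoleIndex == 0):
// worker indices 0..2 map to group 0 (slot 0 is reserved for the first external
// thread), indices 3..6 map to group 1, and index 7 wraps around to group 0 with
// the hole ignored (holeIdx == INT_MAX).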

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */