xref: /oneTBB/src/tbb/misc_ex.cpp (revision 055cc6ea)
/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, when workers
        // are not waited on to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

#define curMaskSize sizeof(basic_mask_t) * num_masks
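// N.B.: curMaskSize expands to an expression that reads the global num_masks,
// so it is meaningful only after num_masks has been initialized.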
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize
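
// Illustrative usage sketch (not part of the original source): affinity_helper
// acts as an RAII guard for the thread affinity mask.
//
//     affinity_helper guard;
//     guard.protect_affinity_mask( /*restore_process_mask=*/true );
//     // ... run code that must observe the original process mask ...
//     // On scope exit, ~affinity_helper() restores the saved thread mask,
//     // unless dismiss() was called because the mask never changed.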

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
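    // The kernel rejects an affinity buffer smaller than its internal cpumask
    // (the syscall fails with EINVAL on Linux, ERANGE on the BSDs), so the
    // buffer is doubled until the call succeeds or a sanity bound on the mask
    // size is reached. For example (hypothetical configuration), if basic_mask_t
    // covers 1024 bits but the kernel is built for 2048 CPUs, the first call
    // fails and numMasks grows from 1 to 2.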
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // set here because affinity_helper depends on it
#if __unix__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring the original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // the thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // fail-safe
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Workaround for Android: read the number of available CPUs from sysfs,
// since the system calls are unreliable.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
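// Example (illustrative): a "present" file containing "0-3,5" yields
// (3 - 0 + 1) + 1 = 5 CPUs.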
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports at most 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** With multiple processor group support, the current implementation defines
        "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme that remaps workers depending on the
                pending external threads' affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
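// Illustration (hypothetical sizes): on a machine with two groups of 36 and 28
// processors, numProcs is {36, 28} and numProcsRunningTotal is {36, 64}, so
// the last element always holds the total processor count.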

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};
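// These Kernel32 entry points exist only on Windows 7 / Server 2008 R2 and
// later; on older systems dynamic_link leaves the pointers NULL and the code
// below falls back to a single processor group.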

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // For such applications processor group info is unavailable,
    // so set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default, set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up multiple processor groups when the process does not restrict the
    // affinity mask and more than one group may be present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process has no restricting affinity mask, and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safe: release builds limit the available concurrency level,
        // while debug builds assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))
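// Worked example (hypothetical sizes): two groups of 4 processors each and
// HoleIndex == 0, so numProcsRunningTotal is {4, 8}. Worker indices 0..2 map
// to group 0 (one slot there is reserved for the external thread), indices
// 3..6 map to group 1, and index 7 wraps around to group 0 in a round-robin
// fashion with the hole ignored.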

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread the extra workers in a round-robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */