/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __linux__
#include <sys/sysinfo.h>
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __linux__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

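// Applies the given mask to the calling thread: on Linux, pid 0 selects the
// current thread; on FreeBSD, CPU_WHICH_TID with id -1 selects the current thread.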
static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __linux__
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, as workers
        // are not waited on to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

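// Captures the affinity mask of the calling thread into threadMask.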
static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __linux__
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

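// Process affinity mask captured during initialization, stored as an array of
// num_masks basic_mask_t items (see initialize_hardware_concurrency_info below).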
static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

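// Byte size of the currently allocated affinity mask array.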
#define curMaskSize sizeof(basic_mask_t) * num_masks
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
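// Saves the current thread affinity mask so that the destructor can restore it.
// If restore_process_mask is set, the thread is also switched back to the stored
// process mask; otherwise the caller is assumed to change the mask itself.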
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

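// The kernel rejects affinity buffers smaller than its internal CPU set
// (EINVAL on Linux, ERANGE on FreeBSD), so the buffer below is doubled until
// the query succeeds or a sanity cap on the mask size is reached.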
static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
#if __linux__
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#else /* FreeBSD >= 7.1 */
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
#endif
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize =  sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __linux__
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#else /* FreeBSD >= 7.1 */
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#endif /* FreeBSD >= 7.1 */
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // set here because affinity_helper depends on it
#if __linux__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring the original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // the thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
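        // Count the processors set in the captured mask, clipping at maxProcs.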
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail-safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}


int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Workaround for Android: read the number of available CPUs from sysfs, since
// the system calls are unreliable there.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
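// Example: "0-3,5" describes (3 - 0 + 1) + 1 == 5 CPUs.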
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)

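// Generic POSIX fallback: _SC_NPROCESSORS_ONLN reports the number of processors
// currently online.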
int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports at most 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** With multiple processor groups supported, the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO:   Implement a dynamic scheme remapping workers depending on the pending
                external threads affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

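// Local mirror of the GROUP_AFFINITY structure from <winnt.h>, declared here so
// the code also builds with SDKs that predate processor group support.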
struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

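// These entry points exist only on Windows 7 / Windows Server 2008 R2 and later,
// hence the lookup through dynamic_link instead of a static import.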
static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
#if __TBB_WIN8UI_SUPPORT
    // Processor groups info is unavailable to these applications;
    // set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default, set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up multiple processor groups when the process affinity mask is unrestricted
    // and more than one processor group is present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have a restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safety strap: release builds limit the available concurrency
        // level, while debug builds assert (above).
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo  &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

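// Worked example (illustrative): with two groups of 4 processors each and
// HoleIndex == 1, workers 0..3 map to group 0 and workers 4..6 map to group 1;
// the remaining slot in group 1 is the hole reserved for the external thread.
// Worker 7 would oversubscribe the machine and wraps round-robin to group 0.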
int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread the extra workers in a round-robin manner
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index, assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}
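
// Usage sketch (illustrative only; workerIdx is a hypothetical worker index):
//     int group = FindProcessorGroupIndex( workerIdx );
//     MoveThreadIntoProcessorGroup( GetCurrentThread(), group );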

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */