1 /*
2 Copyright (c) 2005-2023 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 // Source file for miscellaneous entities that are infrequently referenced by
18 // an executing program, and implementation of which requires dynamic linking.
19
20 #include "misc.h"
21
22 #if !defined(__TBB_HardwareConcurrency)
23
24 #include "dynamic_link.h"
25 #include <stdio.h>
26 #include <limits.h>
27
28 #if _WIN32||_WIN64
29 #include <windows.h>
30 #if __TBB_WIN8UI_SUPPORT
31 #include <thread>
32 #endif
33 #else
34 #include <unistd.h>
35 #if __unix__
36 #if __linux__
37 #include <sys/sysinfo.h>
38 #endif
39 #include <cstring>
40 #include <sched.h>
41 #include <cerrno>
42 #elif __sun
43 #include <sys/sysinfo.h>
44 #elif __FreeBSD__
45 #include <cerrno>
46 #include <cstring>
47 #include <sys/param.h> // Required by <sys/cpuset.h>
48 #include <sys/cpuset.h>
49 #endif
50 #endif
51
52 namespace tbb {
53 namespace detail {
54 namespace r1 {
55
56 #if __TBB_USE_OS_AFFINITY_SYSCALL
57
#if __unix__
// Handlers for interoperation with libiomp.
// Filled in at run time by dynamic_link() when libiomp5.so is present
// (see initialize_hardware_concurrency_info below).
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points.
// NOTE(review): DLD_NOWEAK presumably requires the symbol to resolve with no
// weak fallback — confirm against dynamic_link.h.
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif
66
set_thread_affinity_mask(std::size_t maskSize,const basic_mask_t * threadMask)67 static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
68 #if __FreeBSD__ || __NetBSD__ || __OpenBSD__
69 if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
70 #else /* __unix__ */
71 if( sched_setaffinity( 0, maskSize, threadMask ) )
72 #endif
73 // Here and below the error severity is lowered from critical level
74 // because it may happen during TBB library unload because of not
75 // waiting for workers to complete (current RML policy, to be fixed).
76 // handle_perror( errno, "setaffinity syscall" );
77 runtime_warning( "setaffinity syscall failed" );
78 }
79
get_thread_affinity_mask(std::size_t maskSize,basic_mask_t * threadMask)80 static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
81 #if __FreeBSD__ || __NetBSD__ || __OpenBSD__
82 if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
83 #else /* __unix__ */
84 if( sched_getaffinity( 0, maskSize, threadMask ) )
85 #endif
86 runtime_warning( "getaffinity syscall failed" );
87 }
88
// Process affinity mask captured during initialize_hardware_concurrency_info;
// owned by this file and released by destroy_process_mask().
static basic_mask_t* process_mask;
// Number of basic_mask_t elements in process_mask (found by probing the
// required mask size in initialize_hardware_concurrency_info).
static int num_masks;
91
destroy_process_mask()92 void destroy_process_mask() {
93 delete [] process_mask;
94 process_mask = nullptr;
95 }
96
// Size in bytes of an affinity mask made of num_masks basic_mask_t elements.
// Scoped to the affinity_helper members below (it is #undef'ed after them).
#define curMaskSize sizeof(basic_mask_t) * num_masks
~affinity_helper()98 affinity_helper::~affinity_helper() {
99 if( threadMask ) {
100 if( is_changed ) {
101 set_thread_affinity_mask( curMaskSize, threadMask );
102 }
103 delete [] threadMask;
104 }
105 }
protect_affinity_mask(bool restore_process_mask)106 void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
107 if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity?
108 threadMask = new basic_mask_t [num_masks];
109 std::memset( threadMask, 0, curMaskSize );
110 get_thread_affinity_mask( curMaskSize, threadMask );
111 if( restore_process_mask ) {
112 __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
113 is_changed = memcmp( process_mask, threadMask, curMaskSize );
114 if( is_changed )
115 set_thread_affinity_mask( curMaskSize, process_mask );
116 } else {
117 // Assume that the mask will be changed by the caller.
118 is_changed = 1;
119 }
120 }
121 }
dismiss()122 void affinity_helper::dismiss() {
123 delete [] threadMask;
124 threadMask = nullptr;
125 is_changed = 0;
126 }
127 #undef curMaskSize
128
// One-time initialization state for initialize_hardware_concurrency_info.
static std::atomic<do_once_state> hardware_concurrency_info;

// Cached result of AvailableHwConcurrency; written once during initialization.
static int theNumProcs;
132
// Determines how many processors this process may actually use (honoring the
// process affinity mask when it can be queried) and caches the answer in
// theNumProcs. Also captures the process affinity mask into
// process_mask / num_masks for later use by affinity_helper.
static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    // NOTE(review): sysconf may return -1 on failure; the counting loop below
    // then never executes and the fail-safety strap at the end yields 1.
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    // Probe for a buffer large enough to hold the process affinity mask:
    // double the element count until the syscall stops reporting the
    // "mask too small" errno, or a sanity cap on CPU count is reached.
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        // ERANGE means the buffer is too small; cap at 16K CPUs.
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        // EINVAL means the buffer is too small; cap at 256K CPUs.
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        // Count the CPUs set in the captured mask, never exceeding the number
        // of processors currently online.
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr);
}
196
AvailableHwConcurrency()197 int AvailableHwConcurrency() {
198 atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
199 return theNumProcs;
200 }
201
202 /* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
203 #elif __ANDROID__
204
205 // Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
206 // Format of "present" file is: ([<int>-<int>|<int>],)+
// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
// Format of "present" file is: ([<int>-<int>|<int>],)+
// Returns at least 1 even if the file is missing or unparsable.
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == nullptr) return 1;
    int num_args, lower, upper, num_cpus = 0;
    // "%d-%d" matches either a full "lo-hi" range (2 conversions) or a lone
    // CPU id (1 conversion). Note: the original used "%u" with int arguments,
    // which is undefined behavior; "%d" matches the declared types.
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch (num_args) {
        case 2: num_cpus += upper - lower + 1; break; // inclusive range of ids
        case 1: num_cpus += 1; break;                 // single id
        default:
            // Malformed token: fscanf matched nothing and consumed nothing,
            // so skip one character to guarantee forward progress (the
            // original code would loop forever here).
            if (fgetc(fp) == EOF) num_args = EOF;
            break;
        }
        if (num_args == EOF) break;
        fscanf(fp, ","); // consume the separator between entries, if any
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
221
222 #elif defined(_SC_NPROCESSORS_ONLN)
223
// Returns the number of processors currently online, falling back to 1
// when sysconf cannot report it (it returns -1 on failure).
int AvailableHwConcurrency() {
    const int online = static_cast<int>(sysconf(_SC_NPROCESSORS_ONLN));
    return online > 0 ? online : 1;
}
228
229 #elif _WIN32||_WIN64
230
// One-time initialization state for initialize_hardware_concurrency_info.
static std::atomic<do_once_state> hardware_concurrency_info;

// Group index meaning "all groups" when passed to GetActiveProcessorCount
// (see the assertion in initialize_hardware_concurrency_info).
static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports maximum 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;
238
//! Per-group bookkeeping for Windows processor groups.
struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme remapping workers depending on the pending
              external threads affinity. **/
    static int HoleIndex;
};
256
int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
// Cumulative per-group weights used by FindProcessorGroupIndex to distribute
// threads evenly between processor groups when the number of threads exceeds
// the number of cores (filled in by initialize_hardware_concurrency_info).
int calculate_numa[MaxProcessorGroups];
// Last entry of calculate_numa, i.e. the total distribution weight.
int numaSum;
// Local mirror of the Win32 GROUP_AFFINITY structure used with the dynamically
// resolved processor-group APIs below.
// NOTE(review): presumably declared here so the code builds against SDKs that
// lack GROUP_AFFINITY — confirm; layout must match the system definition.
struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD   Group;
    WORD   Reserved[3];
};
268
// Pointers to the processor-group APIs, resolved at run time from Kernel32.dll
// by the dynamic_link call in initialize_hardware_concurrency_info; they stay
// null when the running Windows version does not export them.
static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

// Table for mapping to Kernel32 entry points.
static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};
281
// Queries the process affinity mask and (when it does not restrict the
// process) the processor-group topology, filling theProcessorGroups,
// calculate_numa, and numaSum.
static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // For these applications processor groups info is unavailable
    // Setting up a number of processors for one processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    // Count the bits set in the process affinity mask.
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr);
    // By default setting up a number of processors for one processor group
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have restricting affinity mask and multiple processor groups are possible
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr);
        // Fail safety bootstrap. Release versions will limit available concurrency
        // level, while debug ones would assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            // Remember the group hosting the current (first external) thread as the "hole".
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            int min_procs = INT_MAX;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                if (pgi.numProcs < min_procs) min_procs = pgi.numProcs; // Finding the minimum number of processors in the Processor Groups
                calculate_numa[i] = pgi.numProcs;
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr);
                // All-ones mask for a full 64-processor group; otherwise the low numProcs bits.
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr);

            // Turn per-group sizes into cumulative weights (in units of the
            // smallest group's size) consumed by FindProcessorGroupIndex when
            // distributing oversubscribed threads.
            calculate_numa[0] = (calculate_numa[0] / min_procs)-1;
            for (WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i) {
                calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs);
            }

            numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1];

        }

    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}
346
// Returns the number of Windows processor groups detected at initialization.
// Must only be called after AvailableHwConcurrency has triggered the one-time
// initialization (asserted below).
int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}
351
// Maps a thread/worker index onto a processor group index. The group that
// hosted the first external thread (HoleIndex) is filled first; subsequent
// indices walk the remaining groups round-robin. Indices beyond the machine's
// total processor count are spread between groups using the cumulative
// calculate_numa weights built during initialization.
int FindProcessorGroupIndex ( int procIdx ) {
    int current_grp_idx = ProcessorGroupInfo::HoleIndex;
    // Case 1: procIdx is within the total processor count but does not fit
    // into the hole group: subtract group sizes round-robin until it does.
    if (procIdx >= theProcessorGroups[current_grp_idx].numProcs && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) {
        procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs;
        do {
            current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups);
            procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs;

        } while (procIdx >= 0);
    }
    // Case 2: oversubscription — procIdx exceeds the machine's processor count.
    // Wrap it into [0, numaSum] and pick the group whose cumulative weight
    // covers it, so extra threads land in groups proportionally to their sizes.
    else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) {
        int temp_grp_index = 0;
        procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
        procIdx = procIdx % (numaSum+1); //ProcIdx to stay between 0 and numaSum

        while (procIdx - calculate_numa[temp_grp_index] > 0) {
            temp_grp_index = (temp_grp_index + 1) % ProcessorGroupInfo::NumGroups;
        }
        current_grp_idx = temp_grp_index;
    }
    // Otherwise: procIdx fits inside the hole group itself.
    __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr);

    return current_grp_idx;
}
376
377 void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
378 __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
379 if ( !TBB_SetThreadGroupAffinity )
380 return;
381 TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
382 TBB_SetThreadGroupAffinity( hThread, &ga, nullptr);
383 }
384
385 int AvailableHwConcurrency() {
386 atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
387 return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
388 }
389
390 /* End of _WIN32||_WIN64 implementation */
391 #else
392 #error AvailableHwConcurrency is not implemented for this OS
393 #endif
394
395 } // namespace r1
396 } // namespace detail
397 } // namespace tbb
398
399 #endif /* !__TBB_HardwareConcurrency */
400