/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __linux__
#include <sys/sysinfo.h>
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __linux__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __linux__
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload due to not
        // waiting for workers to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __linux__
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

#define curMaskSize (sizeof(basic_mask_t) * num_masks)
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
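// A minimal usage sketch of this RAII guard (illustrative only, not code that
// exists elsewhere in this file):
//
//     affinity_helper guard;
//     guard.protect_affinity_mask( /*restore_process_mask=*/true );
//     // ... run code that may alter this thread's affinity mask ...
//     // The destructor reinstates the captured thread mask if it was changed,
//     // unless dismiss() was called first.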
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
#if __linux__
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#else /* FreeBSD >= 7.1 */
    int maxProcs = INT_MAX; // not known beforehand; serves as "no limit" while counting mask bits
#endif /* FreeBSD >= 7.1 */
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
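    // The required mask buffer size is not known a priori, so probe for it:
    // geometrically grow the buffer until the getaffinity syscall stops
    // rejecting it (or a sanity cap on the total mask size is reached).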
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __linux__
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#else /* FreeBSD >= 7.1 */
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#endif /* FreeBSD >= 7.1 */
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __linux__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss(); // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android that reads the number of available CPUs from sysfs,
// since system calls are unreliable there.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
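// For example, "0-3" describes 4 CPUs and a lone "0" describes 1; a hypothetical
// big.LITTLE layout "0-3,6-7" would yield 6.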
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ","); // skip the separator
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports a maximum of 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support, the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme remapping workers depending on the pending
              external threads affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD      Group;
    WORD      Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD  (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL  (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                          const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY* prevAff );
static BOOL  (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // Processor groups info is unavailable to these applications;
    // set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups when the process does not have a restricting affinity mask
    // and more than one processor group is present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have a restricting affinity mask and multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safety bootstrap. Release versions will limit the available concurrency
        // level, while debug ones would assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))
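// Worked example for a hypothetical machine (not taken from a real system): two
// groups of 4 processors and HoleIndex == 0. Then procIdx 0..2 land in group 0
// (slot 0 of that group stays reserved for the first external thread), procIdx 3..6
// land in group 1, procIdx 7 fills the reserved hole in group 0, and any larger
// procIdx wraps around in a round-robin manner.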
int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription, spread extra workers in a round-robin manner.
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
    #error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */