/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __linux__
#include <sys/sysinfo.h>
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __linux__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __linux__
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, when workers
        // are not waited on to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __linux__
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#else /* FreeBSD */
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}

#define curMaskSize (sizeof(basic_mask_t) * num_masks)
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize
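
// A minimal usage sketch of affinity_helper (illustrative comment only, not
// part of the library API): protect_affinity_mask() captures the current
// thread mask, and the destructor restores it unless dismiss() was called.
//
//     affinity_helper guard;
//     guard.protect_affinity_mask( /*restore_process_mask=*/true );
//     // ... the thread now runs with the stored process-wide mask ...
//     // leaving the scope re-applies the captured thread mask if it changed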

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
#if __linux__
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    int pid = getpid();
#else /* FreeBSD >= 7.1 */
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
#endif
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    // The kernel mask may be wider than basic_mask_t, so double the buffer
    // until the syscall accepts its size or a sanity limit is reached.
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __linux__
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#else /* FreeBSD >= 7.1 */
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#endif /* FreeBSD >= 7.1 */
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __linux__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring the original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android that reads the correct number of available CPUs, since system calls are unreliable.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
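
// Worked example of the parse above (illustrative): a "present" file
// containing "0-3,5" is read as the range 0-3 (4 CPUs) followed by the
// single CPU 5, so num_cpus == 5.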

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports a maximum of 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR mask;                  ///< Affinity mask covering the whole group
    int       numProcs;              ///< Number of processors in the group
    int       numProcsRunningTotal;  ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups, the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme that remaps workers depending on the
        affinity of pending external threads. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
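
// Note: TBB_GROUP_AFFINITY below mirrors the Win32 GROUP_AFFINITY structure
// (Mask, Group, Reserved[3]); it is declared locally, and the processor-group
// entry points are resolved from Kernel32.dll at run time, presumably so the
// library also loads on systems and SDKs without processor group support.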

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD      Group;
    WORD      Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
#if __TBB_WIN8UI_SUPPORT
    // For these applications processor group information is unavailable,
    // so set up the number of processors for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default set up the number of processors for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups when the process does not have a restricting
    // affinity mask and more than one processor group is present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have a restricting affinity mask and multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail safety bootstrap. Release versions will limit the available
        // concurrency level, while debug ones will assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread extra workers in a round-robin manner.
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
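
// Worked example for FindProcessorGroupIndex (illustrative): with two groups
// of 4 processors each and HoleIndex == 0, one slot in group 0 is kept for
// the first external thread, so worker indices 0..2 map to group 0, indices
// 3..6 map to group 1, and index 7 (oversubscription) wraps around to group 0.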

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
#error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */