/*
    Copyright (c) 2005-2021 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level,
        // because the failure may happen during TBB library unload, when workers
        // are not waited for (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    if( process_mask ) {
        delete [] process_mask;
    }
}
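// The affinity_helper implementation below follows an RAII protocol: the first
// call to protect_affinity_mask() captures the current thread mask (optionally
// swapping in the stored process-wide mask for the duration of the scope), and
// the destructor restores the captured mask unless dismiss() was called.
// A minimal usage sketch of a hypothetical caller, for illustration only:
//
//     {
//         affinity_helper helper;
//         helper.protect_affinity_mask( /*restore_process_mask=*/ true );
//         // ... run code that must observe the full process mask ...
//     }   // leaving the scope restores the captured thread mask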
#define curMaskSize (sizeof(basic_mask_t) * num_masks)
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == NULL && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    if( threadMask ) {
        delete [] threadMask;
        threadMask = NULL;
    }
    is_changed = 0;
}
#undef curMaskSize

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // Set here because it is needed for affinity_helper to work.
#if __unix__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring the original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss(); // The thread mask has not changed.
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail-safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), NULL );
}
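// A minimal standalone sketch of the mask-size probing used above (hypothetical
// code, assuming the glibc behavior that sched_getaffinity() fails with EINVAL
// when the buffer is smaller than the kernel's cpumask): keep doubling the
// buffer until the call succeeds or a sanity bound is hit.
//
//     int numMasks = 1;
//     basic_mask_t* mask = new basic_mask_t[numMasks];
//     while ( sched_getaffinity( 0, sizeof(basic_mask_t) * numMasks, mask ) == -1
//             && errno == EINVAL && numMasks < 4096 /* hypothetical bound */ ) {
//         delete[] mask;
//         numMasks <<= 1;
//         mask = new basic_mask_t[numMasks];
//     }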
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android, which reads the number of available CPUs from sysfs
// because the system calls are unreliable.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == NULL) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
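// Worked example (hypothetical file contents, for illustration): a "present"
// file containing "0-3,6" is parsed as the range 0-3 (4 CPUs, num_args == 2)
// followed by the single entry 6 (1 CPU, num_args == 1), yielding num_cpus == 5.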
#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports a maximum of 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR mask;             ///< Affinity mask covering the whole group
    int numProcs;               ///< Number of processors in the group
    int numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support, the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme remapping workers depending on the pending
        external threads' affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD      Group;
    WORD      Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = NULL;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = NULL;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // Processor groups information is unavailable for these applications;
    // set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, NULL );
    // By default, set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups if the process does not have a restricting affinity mask
    // and more than one processor group may be present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have a restricting affinity mask and multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, NULL );
        // Fail-safety bootstrap: release versions limit the available concurrency
        // level, while debug versions assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, NULL );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), NULL );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription, spread the extra workers in a round-robin manner.
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups-1;
    // Now adjust the approximation up or down
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, NULL );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, NULL );
    return i;
}
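// Worked example (hypothetical machine, for illustration): two groups of 4
// processors each, numProcsRunningTotal = {4, 8}, HoleIndex == 0. Worker
// indices 0..2 map to group 0 (filling the three slots next to the hole
// reserved for the external thread), indices 3..6 map to group 1, and index 7
// wraps around to group 0 via the round-robin branch (holeIdx == INT_MAX).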
void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, NULL );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
#error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */