/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, when workers
        // are not waited for to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    delete [] process_mask;
    process_mask = nullptr;
}

// Parenthesized to stay safe in any expression context.
#define curMaskSize (sizeof(basic_mask_t) * num_masks)
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    delete [] threadMask;
    threadMask = nullptr;
    is_changed = 0;
}
#undef curMaskSize
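
// Illustrative only: a minimal sketch of how affinity_helper is meant to be
// used (hypothetical call site, not code from this library). The destructor
// restores the saved thread mask unless dismiss() was called.
//
//     void inspect_topology_with_full_mask() {
//         affinity_helper helper;
//         // Save this thread's mask and, since restore_process_mask is true,
//         // temporarily widen it back to the stored process mask.
//         helper.protect_affinity_mask( /*restore_process_mask=*/true );
//         // ... query topology here ...
//     }   // ~affinity_helper() re-applies the original thread mask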

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring the original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail-safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr );
}
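
// The retry loop above discovers the kernel's CPU set size by doubling the
// buffer until the affinity call stops failing with EINVAL (ERANGE on the BSDs).
// A minimal standalone sketch of the same technique (hypothetical helper, not
// part of this library; glibc/Linux only, compile with _GNU_SOURCE defined):
//
//     #include <sched.h>
//     #include <cerrno>
//     #include <climits>
//     #include <vector>
//
//     std::vector<cpu_set_t> capture_process_mask() {
//         std::vector<cpu_set_t> mask( 1 );
//         while ( sched_getaffinity( 0, mask.size() * sizeof(cpu_set_t), mask.data() ) == -1 ) {
//             if ( errno != EINVAL || mask.size() * sizeof(cpu_set_t) * CHAR_BIT >= 256 * 1024 )
//                 return {};                  // give up, as the code above does
//             mask.resize( mask.size() * 2 ); // kernel mask is wider; retry
//         }
//         return mask;
//     }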

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android that reads the number of available CPUs directly,
// since the affinity system calls are unreliable there.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == nullptr) return 1;
    int num_args, lower, upper, num_cpus = 0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");  // consume the separator, if any
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
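
// Example (hypothetical values): on a device where CPUs 0-3 and 6-7 are
// present, the file reads "0-3,6-7" and the parser above counts
// (3-0+1) + (7-6+1) = 6 CPUs.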

#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports maximum 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme remapping workers depending on the pending
              external threads' affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD      Group;
    WORD      Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr;
static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr;
static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                        const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff );
static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};

static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // For these applications processor groups info is unavailable;
    // set up the number of processors for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr );
    // By default set up the number of processors for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups in case the process does not restrict the affinity
    // mask and more than one processor group is present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process does not have a restricting affinity mask and multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr );
        // Fail-safety bootstrap. Release versions will limit the available
        // concurrency level, while debug ones will assert.
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo &pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr );
                // Shifting a DWORD_PTR by its full width is undefined behavior,
                // hence the explicit all-ones mask for a full group.
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr );
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups > 1)
        for (int i = 0; i < ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo("----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

// Offset for the slot reserved for the first external thread
#define HoleAdjusted(procIdx, grpIdx) (procIdx + (holeIdx <= grpIdx))

int FindProcessorGroupIndex ( int procIdx ) {
    // In case of oversubscription spread the extra workers in a round-robin manner.
    int holeIdx;
    const int numProcs = theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
    if ( procIdx >= numProcs - 1 ) {
        holeIdx = INT_MAX;
        procIdx = (procIdx - numProcs + 1) % numProcs;
    }
    else
        holeIdx = ProcessorGroupInfo::HoleIndex;
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "FindProcessorGroupIndex is used before AvailableHwConcurrency" );
    // Approximate the likely group index assuming all groups are of the same size.
    int i = procIdx / theProcessorGroups[0].numProcs;
    // Make sure the approximation is a valid group index.
    if (i >= ProcessorGroupInfo::NumGroups) i = ProcessorGroupInfo::NumGroups - 1;
    // Now adjust the approximation up or down.
    if ( theProcessorGroups[i].numProcsRunningTotal > HoleAdjusted(procIdx, i) ) {
        while ( theProcessorGroups[i].numProcsRunningTotal - theProcessorGroups[i].numProcs > HoleAdjusted(procIdx, i) ) {
            __TBB_ASSERT( i > 0, nullptr );
            --i;
        }
    }
    else {
        do {
            ++i;
        } while ( theProcessorGroups[i].numProcsRunningTotal <= HoleAdjusted(procIdx, i) );
    }
    __TBB_ASSERT( i < ProcessorGroupInfo::NumGroups, nullptr );
    return i;
}
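
// Worked example (hypothetical numbers): two groups of 4 processors each give
// numProcsRunningTotal == {4, 8}. With HoleIndex == 0 (one slot in group 0 is
// reserved for the first external thread), procIdx == 3 is hole-adjusted to 4,
// which is not below group 0's subtotal of 4, so the search advances and
// returns group 1. With procIdx == 7 (>= numProcs - 1) the oversubscription
// path wraps it around to 0 with the hole ignored, mapping the extra worker
// back to group 0 in round-robin fashion.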

void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, nullptr );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
#error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */