/*
    Copyright (c) 2005-2022 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and implementation of which requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

// Applies the given affinity mask to the calling thread.
// On the BSD family the anonymous (current-thread) cpuset is used; elsewhere
// the Linux-style sched_setaffinity on TID 0 (the caller) is used.
static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from critical level
        // because it may happen during TBB library unload because of not
        // waiting for workers to complete (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

// Captures the calling thread's current affinity mask into threadMask.
// Failure is reported as a warning only (see the note above).
static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

// Process-wide affinity mask captured by initialize_hardware_concurrency_info();
// owned here and released by destroy_process_mask().
static basic_mask_t* process_mask;
// Number of basic_mask_t elements in process_mask (the probed mask buffer length).
static int num_masks;

void destroy_process_mask() {
    delete [] process_mask;
    process_mask = nullptr;
}

// Size in bytes of the currently configured affinity mask buffer.
#define curMaskSize sizeof(basic_mask_t) * num_masks
// Restores the saved thread mask (if it was changed) and releases the buffer.
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
// Saves the calling thread's current affinity mask so the destructor can
// restore it. When restore_process_mask is true, the thread mask is also
// switched to the stored process mask if the two differ; otherwise the
// caller is assumed to change the mask itself, so restoration is forced.
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
// Cancels restoration: the destructor will neither restore nor need the mask.
void affinity_helper::dismiss() {
    delete [] threadMask;
    threadMask = nullptr;
    is_changed = 0;
}
#undef curMaskSize

static std::atomic<do_once_state> hardware_concurrency_info;

// Cached result of AvailableHwConcurrency(); set once under atomic_do_once.
static int theNumProcs;

// Probes the kernel's cpu-set size (doubling the mask buffer until the
// getaffinity syscall stops failing with a size-related errno), captures the
// process affinity mask, and counts the processors available to the process.
static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        // Buffer too small for the kernel's cpu set: double it and retry.
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // do here because it's needed for affinity_helper to work
#if __unix__
        // For better coexistence with libiomp which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        // Count set bits across all mask words, capped by the online CPU count.
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr);
}

// Returns the number of processors the current process may run on (cached).
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android that reads the correct number of available CPUs since system calls are unreliable.
206 // Format of "present" file is: ([<int>-<int>|<int>],)+ 207 int AvailableHwConcurrency() { 208 FILE *fp = fopen("/sys/devices/system/cpu/present", "r"); 209 if (fp == nullptr) return 1; 210 int num_args, lower, upper, num_cpus=0; 211 while ((num_args = fscanf(fp, "%u-%u", &lower, &upper)) != EOF) { 212 switch(num_args) { 213 case 2: num_cpus += upper - lower + 1; break; 214 case 1: num_cpus += 1; break; 215 } 216 fscanf(fp, ","); 217 } 218 return (num_cpus > 0) ? num_cpus : 1; 219 } 220 221 #elif defined(_SC_NPROCESSORS_ONLN) 222 223 int AvailableHwConcurrency() { 224 int n = sysconf(_SC_NPROCESSORS_ONLN); 225 return (n > 0) ? n : 1; 226 } 227 228 #elif _WIN32||_WIN64 229 230 static std::atomic<do_once_state> hardware_concurrency_info; 231 232 static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff; 233 234 // Statically allocate an array for processor group information. 235 // Windows 7 supports maximum 4 groups, but let's look ahead a little. 236 static const WORD MaxProcessorGroups = 64; 237 238 struct ProcessorGroupInfo { 239 DWORD_PTR mask; ///< Affinity mask covering the whole group 240 int numProcs; ///< Number of processors in the group 241 int numProcsRunningTotal; ///< Subtotal of processors in this and preceding groups 242 243 //! Total number of processor groups in the system 244 static int NumGroups; 245 246 //! Index of the group with a slot reserved for the first external thread 247 /** In the context of multiple processor groups support current implementation 248 defines "the first external thread" as the first thread to invoke 249 AvailableHwConcurrency(). 250 251 TODO: Implement a dynamic scheme remapping workers depending on the pending 252 external threads affinity. 
**/ 253 static int HoleIndex; 254 }; 255 256 int ProcessorGroupInfo::NumGroups = 1; 257 int ProcessorGroupInfo::HoleIndex = 0; 258 259 ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups]; 260 int calculate_numa[MaxProcessorGroups]; //Array needed for FindProcessorGroupIndex to calculate Processor Group when number of threads > number of cores to distribute threads evenly between processor groups 261 int numaSum; 262 struct TBB_GROUP_AFFINITY { 263 DWORD_PTR Mask; 264 WORD Group; 265 WORD Reserved[3]; 266 }; 267 268 static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr; 269 static WORD (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr; 270 static BOOL (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread, 271 const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY *prevAff ); 272 static BOOL (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* ); 273 274 static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = { 275 DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount) 276 , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount) 277 , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity) 278 , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity) 279 }; 280 281 static void initialize_hardware_concurrency_info () { 282 suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS); 283 #if __TBB_WIN8UI_SUPPORT 284 // For these applications processor groups info is unavailable 285 // Setting up a number of processors for one processor group 286 theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency(); 287 #else /* __TBB_WIN8UI_SUPPORT */ 288 dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable, 289 sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) ); 290 SYSTEM_INFO si; 291 GetNativeSystemInfo(&si); 292 DWORD_PTR pam, sam, m = 1; 293 GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam ); 294 int nproc = 0; 295 
for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) { 296 if ( pam & m ) 297 ++nproc; 298 } 299 __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr); 300 // By default setting up a number of processors for one processor group 301 theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc; 302 // Setting up processor groups in case the process does not restrict affinity mask and more than one processor group is present 303 if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) { 304 // The process does not have restricting affinity mask and multiple processor groups are possible 305 ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount(); 306 __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr); 307 // Fail safety bootstrap. Release versions will limit available concurrency 308 // level, while debug ones would assert. 309 if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups ) 310 ProcessorGroupInfo::NumGroups = MaxProcessorGroups; 311 if ( ProcessorGroupInfo::NumGroups > 1 ) { 312 TBB_GROUP_AFFINITY ga; 313 if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) ) 314 ProcessorGroupInfo::HoleIndex = ga.Group; 315 int nprocs = 0; 316 int min_procs = INT_MAX; 317 for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) { 318 ProcessorGroupInfo &pgi = theProcessorGroups[i]; 319 pgi.numProcs = (int)TBB_GetActiveProcessorCount(i); 320 if (pgi.numProcs < min_procs) min_procs = pgi.numProcs; //Finding the minimum number of processors in the Processor Groups 321 calculate_numa[i] = pgi.numProcs; 322 __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr); 323 pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? 
~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1; 324 pgi.numProcsRunningTotal = nprocs += pgi.numProcs; 325 } 326 __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr); 327 328 calculate_numa[0] = (calculate_numa[0] / min_procs)-1; 329 for (WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i) { 330 calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs); 331 } 332 333 numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1]; 334 335 } 336 337 } 338 #endif /* __TBB_WIN8UI_SUPPORT */ 339 340 PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups); 341 if (ProcessorGroupInfo::NumGroups>1) 342 for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i) 343 PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs); 344 } 345 346 int NumberOfProcessorGroups() { 347 __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "NumberOfProcessorGroups is used before AvailableHwConcurrency" ); 348 return ProcessorGroupInfo::NumGroups; 349 } 350 351 int FindProcessorGroupIndex ( int procIdx ) { 352 int current_grp_idx = ProcessorGroupInfo::HoleIndex; 353 if (procIdx >= theProcessorGroups[current_grp_idx].numProcs && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { 354 procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; 355 do { 356 current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups); 357 procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs; 358 359 } while (procIdx >= 0); 360 } 361 else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) { 362 int temp_grp_index = 0; 363 procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; 364 procIdx = procIdx % (numaSum+1); //ProcIdx to stay between 0 and numaSum 365 366 while (procIdx - calculate_numa[temp_grp_index] > 0) { 367 temp_grp_index = 
(temp_grp_index + 1) % ProcessorGroupInfo::NumGroups; 368 } 369 current_grp_idx = temp_grp_index; 370 } 371 __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr); 372 373 return current_grp_idx; 374 } 375 376 void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) { 377 __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized, "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" ); 378 if ( !TBB_SetThreadGroupAffinity ) 379 return; 380 TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} }; 381 TBB_SetThreadGroupAffinity( hThread, &ga, nullptr); 382 } 383 384 int AvailableHwConcurrency() { 385 atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info ); 386 return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal; 387 } 388 389 /* End of _WIN32||_WIN64 implementation */ 390 #else 391 #error AvailableHwConcurrency is not implemented for this OS 392 #endif 393 394 } // namespace r1 395 } // namespace detail 396 } // namespace tbb 397 398 #endif /* !__TBB_HardwareConcurrency */ 399