/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

// Source file for miscellaneous entities that are infrequently referenced by
// an executing program, and whose implementation requires dynamic linking.

#include "misc.h"

#if !defined(__TBB_HardwareConcurrency)

#include "dynamic_link.h"
#include <stdio.h>
#include <limits.h>

#if _WIN32||_WIN64
#include <windows.h>
#if __TBB_WIN8UI_SUPPORT
#include <thread>
#endif
#else
#include <unistd.h>
#if __unix__
#if __linux__
#include <sys/sysinfo.h>
#endif
#include <cstring>
#include <sched.h>
#include <cerrno>
#elif __sun
#include <sys/sysinfo.h>
#elif __FreeBSD__
#include <cerrno>
#include <cstring>
#include <sys/param.h>  // Required by <sys/cpuset.h>
#include <sys/cpuset.h>
#endif
#endif

namespace tbb {
namespace detail {
namespace r1 {

#if __TBB_USE_OS_AFFINITY_SYSCALL

#if __unix__
// Handlers for interoperation with libiomp
static int (*libiomp_try_restoring_original_mask)();
// Table for mapping to libiomp entry points
static const dynamic_link_descriptor iompLinkTable[] = {
    DLD_NOWEAK( kmp_set_thread_affinity_mask_initial, libiomp_try_restoring_original_mask )
};
#endif

static void set_thread_affinity_mask( std::size_t maskSize, const basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_setaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_setaffinity( 0, maskSize, threadMask ) )
#endif
        // Here and below the error severity is lowered from the critical level
        // because the failure may happen during TBB library unload, when workers
        // have not been waited for (current RML policy, to be fixed).
        // handle_perror( errno, "setaffinity syscall" );
        runtime_warning( "setaffinity syscall failed" );
}

static void get_thread_affinity_mask( std::size_t maskSize, basic_mask_t* threadMask ) {
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
    if( cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, maskSize, threadMask ) )
#else /* __unix__ */
    if( sched_getaffinity( 0, maskSize, threadMask ) )
#endif
        runtime_warning( "getaffinity syscall failed" );
}

static basic_mask_t* process_mask;
static int num_masks;

void destroy_process_mask() {
    delete [] process_mask;
    process_mask = nullptr;
}
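// affinity_helper (declared in misc.h) acts as an RAII guard around the thread
// affinity mask: protect_affinity_mask() captures the current mask (optionally
// restoring the stored process mask first), and the destructor puts the
// captured mask back unless dismiss() was called. A minimal usage sketch,
// assuming a hypothetical caller that temporarily re-pins the thread (the
// caller is illustrative, not code from this library):
//
//     affinity_helper guard;
//     guard.protect_affinity_mask( /*restore_process_mask=*/true );
//     // ... run work under a temporarily modified thread affinity ...
//     // guard's destructor restores the captured mask on scope exit.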
#define curMaskSize sizeof(basic_mask_t) * num_masks
affinity_helper::~affinity_helper() {
    if( threadMask ) {
        if( is_changed ) {
            set_thread_affinity_mask( curMaskSize, threadMask );
        }
        delete [] threadMask;
    }
}
void affinity_helper::protect_affinity_mask( bool restore_process_mask ) {
    if( threadMask == nullptr && num_masks ) { // TODO: assert num_masks validity?
        threadMask = new basic_mask_t [num_masks];
        std::memset( threadMask, 0, curMaskSize );
        get_thread_affinity_mask( curMaskSize, threadMask );
        if( restore_process_mask ) {
            __TBB_ASSERT( process_mask, "A process mask is requested but not yet stored" );
            is_changed = memcmp( process_mask, threadMask, curMaskSize );
            if( is_changed )
                set_thread_affinity_mask( curMaskSize, process_mask );
        } else {
            // Assume that the mask will be changed by the caller.
            is_changed = 1;
        }
    }
}
void affinity_helper::dismiss() {
    delete [] threadMask;
    threadMask = nullptr;
    is_changed = 0;
}
#undef curMaskSize

static std::atomic<do_once_state> hardware_concurrency_info;

static int theNumProcs;

static void initialize_hardware_concurrency_info () {
    int err;
    int availableProcs = 0;
    int numMasks = 1;
    int maxProcs = sysconf(_SC_NPROCESSORS_ONLN);
    basic_mask_t* processMask;
    const std::size_t BasicMaskSize = sizeof(basic_mask_t);
    for (;;) {
        const int curMaskSize = BasicMaskSize * numMasks;
        processMask = new basic_mask_t[numMasks];
        std::memset( processMask, 0, curMaskSize );
#if __FreeBSD__ || __NetBSD__ || __OpenBSD__
        // CPU_LEVEL_WHICH - anonymous (current) mask, CPU_LEVEL_CPUSET - assigned mask
        err = cpuset_getaffinity( CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, curMaskSize, processMask );
        if ( !err || errno != ERANGE || curMaskSize * CHAR_BIT >= 16 * 1024 )
            break;
#else /* __unix__ */
        int pid = getpid();
        err = sched_getaffinity( pid, curMaskSize, processMask );
        if ( !err || errno != EINVAL || curMaskSize * CHAR_BIT >= 256 * 1024 )
            break;
#endif
        delete[] processMask;
        numMasks <<= 1;
    }
    if ( !err ) {
        // We have found the mask size and captured the process affinity mask into processMask.
        num_masks = numMasks; // Store it here: affinity_helper needs it to work.
#if __unix__
        // For better coexistence with libiomp, which might have changed the mask already,
        // check for its presence and ask it to restore the mask.
        dynamic_link_handle libhandle;
        if ( dynamic_link( "libiomp5.so", iompLinkTable, 1, &libhandle, DYNAMIC_LINK_GLOBAL ) ) {
            // We have found the symbol provided by libiomp5 for restoring the original thread affinity.
            affinity_helper affhelp;
            affhelp.protect_affinity_mask( /*restore_process_mask=*/false );
            if ( libiomp_try_restoring_original_mask()==0 ) {
                // Now we have the right mask to capture, restored by libiomp.
                const int curMaskSize = BasicMaskSize * numMasks;
                std::memset( processMask, 0, curMaskSize );
                get_thread_affinity_mask( curMaskSize, processMask );
            } else
                affhelp.dismiss();  // thread mask has not changed
            dynamic_unlink( libhandle );
            // Destructor of affinity_helper restores the thread mask (unless dismissed).
        }
#endif
        for ( int m = 0; availableProcs < maxProcs && m < numMasks; ++m ) {
            for ( std::size_t i = 0; (availableProcs < maxProcs) && (i < BasicMaskSize * CHAR_BIT); ++i ) {
                if ( CPU_ISSET( i, processMask + m ) )
                    ++availableProcs;
            }
        }
        process_mask = processMask;
    }
    else {
        // Failed to get the process affinity mask; assume the whole machine can be used.
        availableProcs = (maxProcs == INT_MAX) ? sysconf(_SC_NPROCESSORS_ONLN) : maxProcs;
        delete[] processMask;
    }
    theNumProcs = availableProcs > 0 ? availableProcs : 1; // Fail-safety strap
    __TBB_ASSERT( theNumProcs <= sysconf(_SC_NPROCESSORS_ONLN), nullptr );
}
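// The for(;;) loop above discovers the kernel's affinity mask size by doubling
// the buffer until the syscall succeeds: on Linux, sched_getaffinity() fails
// with EINVAL while the user buffer is smaller than the kernel's cpumask
// (e.g. on a kernel configured for more CPUs than fit into one cpu_set_t).
// A standalone sketch of the same probe, assuming a Linux target with
// <sched.h> and <cerrno> (illustrative only, not code from this file):
//
//     int nmasks = 1;
//     cpu_set_t* mask;
//     for (;;) {
//         mask = new cpu_set_t[nmasks];
//         if ( sched_getaffinity( 0, sizeof(cpu_set_t) * nmasks, mask ) == 0 )
//             break;                                    // buffer large enough
//         delete[] mask;
//         if ( errno != EINVAL ) { mask = nullptr; break; }  // real failure
//         nmasks *= 2;                                  // too small: grow, retry
//     }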
int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theNumProcs;
}

/* End of __TBB_USE_OS_AFFINITY_SYSCALL implementation */
#elif __ANDROID__

// Work-around for Android: since the system calls are unreliable there, read
// the number of available CPUs from the "present" file instead.
// Format of the "present" file is: ([<int>-<int>|<int>],)+
int AvailableHwConcurrency() {
    FILE *fp = fopen("/sys/devices/system/cpu/present", "r");
    if (fp == nullptr) return 1;
    int num_args, lower, upper, num_cpus=0;
    while ((num_args = fscanf(fp, "%d-%d", &lower, &upper)) != EOF) {
        switch(num_args) {
            case 2: num_cpus += upper - lower + 1; break;
            case 1: num_cpus += 1; break;
        }
        fscanf(fp, ",");
    }
    fclose(fp);
    return (num_cpus > 0) ? num_cpus : 1;
}
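// Worked example: a "present" file reading "0-3,5" is parsed as the range 0-3
// (both integers match, num_args == 2, contributing 4 CPUs) followed by the
// single entry 5 (only the first integer matches, num_args == 1, contributing
// 1 CPU), so AvailableHwConcurrency() returns 5.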
#elif defined(_SC_NPROCESSORS_ONLN)

int AvailableHwConcurrency() {
    int n = sysconf(_SC_NPROCESSORS_ONLN);
    return (n > 0) ? n : 1;
}

#elif _WIN32||_WIN64

static std::atomic<do_once_state> hardware_concurrency_info;

static const WORD TBB_ALL_PROCESSOR_GROUPS = 0xffff;

// Statically allocate an array for processor group information.
// Windows 7 supports at most 4 groups, but let's look ahead a little.
static const WORD MaxProcessorGroups = 64;

struct ProcessorGroupInfo {
    DWORD_PTR   mask;                   ///< Affinity mask covering the whole group
    int         numProcs;               ///< Number of processors in the group
    int         numProcsRunningTotal;   ///< Subtotal of processors in this and preceding groups

    //! Total number of processor groups in the system
    static int NumGroups;

    //! Index of the group with a slot reserved for the first external thread
    /** In the context of multiple processor groups support the current implementation
        defines "the first external thread" as the first thread to invoke
        AvailableHwConcurrency().

        TODO: Implement a dynamic scheme remapping workers depending on the pending
              external threads affinity. **/
    static int HoleIndex;
};

int ProcessorGroupInfo::NumGroups = 1;
int ProcessorGroupInfo::HoleIndex = 0;

ProcessorGroupInfo theProcessorGroups[MaxProcessorGroups];
// Helper array for FindProcessorGroupIndex: when the number of threads exceeds
// the number of cores, it is used to distribute the extra threads evenly
// between the processor groups.
int calculate_numa[MaxProcessorGroups];
int numaSum;

struct TBB_GROUP_AFFINITY {
    DWORD_PTR Mask;
    WORD      Group;
    WORD      Reserved[3];
};

static DWORD (WINAPI *TBB_GetActiveProcessorCount)( WORD groupIndex ) = nullptr;
static WORD  (WINAPI *TBB_GetActiveProcessorGroupCount)() = nullptr;
static BOOL  (WINAPI *TBB_SetThreadGroupAffinity)( HANDLE hThread,
                         const TBB_GROUP_AFFINITY* newAff, TBB_GROUP_AFFINITY* prevAff );
static BOOL  (WINAPI *TBB_GetThreadGroupAffinity)( HANDLE hThread, TBB_GROUP_AFFINITY* );

static const dynamic_link_descriptor ProcessorGroupsApiLinkTable[] = {
      DLD(GetActiveProcessorCount, TBB_GetActiveProcessorCount)
    , DLD(GetActiveProcessorGroupCount, TBB_GetActiveProcessorGroupCount)
    , DLD(SetThreadGroupAffinity, TBB_SetThreadGroupAffinity)
    , DLD(GetThreadGroupAffinity, TBB_GetThreadGroupAffinity)
};
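// The processor groups API appeared in Windows 7 / Server 2008 R2, so the four
// entry points above are resolved via dynamic_link() at run time rather than
// bound at load time; if the linkage fails, the function pointers stay nullptr
// and the code below falls back to a single processor group. Illustrative
// availability check (hypothetical caller, not code from this file):
//
//     int groups = TBB_GetActiveProcessorGroupCount
//                  ? (int)TBB_GetActiveProcessorGroupCount()  // API available
//                  : 1;                                       // pre-Win7: one group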
static void initialize_hardware_concurrency_info () {
    suppress_unused_warning(TBB_ALL_PROCESSOR_GROUPS);
#if __TBB_WIN8UI_SUPPORT
    // For these applications processor groups info is unavailable;
    // set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = std::thread::hardware_concurrency();
#else /* __TBB_WIN8UI_SUPPORT */
    dynamic_link( "Kernel32.dll", ProcessorGroupsApiLinkTable,
                  sizeof(ProcessorGroupsApiLinkTable)/sizeof(dynamic_link_descriptor) );
    SYSTEM_INFO si;
    GetNativeSystemInfo(&si);
    DWORD_PTR pam, sam, m = 1;
    GetProcessAffinityMask( GetCurrentProcess(), &pam, &sam );
    int nproc = 0;
    for ( std::size_t i = 0; i < sizeof(DWORD_PTR) * CHAR_BIT; ++i, m <<= 1 ) {
        if ( pam & m )
            ++nproc;
    }
    __TBB_ASSERT( nproc <= (int)si.dwNumberOfProcessors, nullptr );
    // By default set up the processor count for a single processor group.
    theProcessorGroups[0].numProcs = theProcessorGroups[0].numProcsRunningTotal = nproc;
    // Set up processor groups when the process does not restrict its affinity mask
    // and more than one processor group may be present.
    if ( nproc == (int)si.dwNumberOfProcessors && TBB_GetActiveProcessorCount ) {
        // The process has no restricting affinity mask and multiple processor groups are possible.
        ProcessorGroupInfo::NumGroups = (int)TBB_GetActiveProcessorGroupCount();
        __TBB_ASSERT( ProcessorGroupInfo::NumGroups <= MaxProcessorGroups, nullptr );
        // Fail-safety bootstrap: release builds limit the available concurrency
        // level, while debug builds assert (just above).
        if ( ProcessorGroupInfo::NumGroups > MaxProcessorGroups )
            ProcessorGroupInfo::NumGroups = MaxProcessorGroups;
        if ( ProcessorGroupInfo::NumGroups > 1 ) {
            TBB_GROUP_AFFINITY ga;
            if ( TBB_GetThreadGroupAffinity( GetCurrentThread(), &ga ) )
                ProcessorGroupInfo::HoleIndex = ga.Group;
            int nprocs = 0;
            int min_procs = INT_MAX;
            for ( WORD i = 0; i < ProcessorGroupInfo::NumGroups; ++i ) {
                ProcessorGroupInfo& pgi = theProcessorGroups[i];
                pgi.numProcs = (int)TBB_GetActiveProcessorCount(i);
                // Track the minimum number of processors among the processor groups.
                if ( pgi.numProcs < min_procs ) min_procs = pgi.numProcs;
                calculate_numa[i] = pgi.numProcs;
                __TBB_ASSERT( pgi.numProcs <= (int)sizeof(DWORD_PTR) * CHAR_BIT, nullptr );
                pgi.mask = pgi.numProcs == sizeof(DWORD_PTR) * CHAR_BIT ? ~(DWORD_PTR)0 : (DWORD_PTR(1) << pgi.numProcs) - 1;
                pgi.numProcsRunningTotal = nprocs += pgi.numProcs;
            }
            __TBB_ASSERT( nprocs == (int)TBB_GetActiveProcessorCount( TBB_ALL_PROCESSOR_GROUPS ), nullptr );

            // Turn calculate_numa into a running total of per-group quotas
            // (each group's size divided by the smallest group size).
            calculate_numa[0] = (calculate_numa[0] / min_procs) - 1;
            for ( WORD i = 1; i < ProcessorGroupInfo::NumGroups; ++i ) {
                calculate_numa[i] = calculate_numa[i-1] + (calculate_numa[i] / min_procs);
            }

            numaSum = calculate_numa[ProcessorGroupInfo::NumGroups - 1];
        }
    }
#endif /* __TBB_WIN8UI_SUPPORT */

    PrintExtraVersionInfo("Processor groups", "%d", ProcessorGroupInfo::NumGroups);
    if (ProcessorGroupInfo::NumGroups>1)
        for (int i=0; i<ProcessorGroupInfo::NumGroups; ++i)
            PrintExtraVersionInfo( "----- Group", "%d: size %d", i, theProcessorGroups[i].numProcs);
}

int NumberOfProcessorGroups() {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized,
                  "NumberOfProcessorGroups is used before AvailableHwConcurrency" );
    return ProcessorGroupInfo::NumGroups;
}

int FindProcessorGroupIndex ( int procIdx ) {
    int current_grp_idx = ProcessorGroupInfo::HoleIndex;
    if (procIdx >= theProcessorGroups[current_grp_idx].numProcs && procIdx < theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) {
        procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs;
        do {
            current_grp_idx = (current_grp_idx + 1) % (ProcessorGroupInfo::NumGroups);
            procIdx = procIdx - theProcessorGroups[current_grp_idx].numProcs;
        } while (procIdx >= 0);
    }
    else if (procIdx >= theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal) {
        int temp_grp_index = 0;
        procIdx = procIdx - theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
        procIdx = procIdx % (numaSum+1); // Keep procIdx within [0, numaSum]

        while (procIdx - calculate_numa[temp_grp_index] > 0) {
            temp_grp_index = (temp_grp_index + 1) % ProcessorGroupInfo::NumGroups;
        }
        current_grp_idx = temp_grp_index;
    }
    __TBB_ASSERT(current_grp_idx < ProcessorGroupInfo::NumGroups, nullptr);

    return current_grp_idx;
}
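// Worked example for the oversubscription branch above, assuming three groups
// of sizes {4, 4, 8}: min_procs is 4, so after the running-total pass
// calculate_numa is {0, 1, 3} and numaSum is 3. An oversubscribed thread index
// is first reduced modulo numaSum + 1 = 4, and the scan then maps residue
// 0 -> group 0, residue 1 -> group 1, and residues 2 and 3 -> group 2, i.e.
// the extra threads are spread proportionally to the group sizes.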
void MoveThreadIntoProcessorGroup( void* hThread, int groupIndex ) {
    __TBB_ASSERT( hardware_concurrency_info == do_once_state::initialized,
                  "MoveThreadIntoProcessorGroup is used before AvailableHwConcurrency" );
    if ( !TBB_SetThreadGroupAffinity )
        return;
    TBB_GROUP_AFFINITY ga = { theProcessorGroups[groupIndex].mask, (WORD)groupIndex, {0,0,0} };
    TBB_SetThreadGroupAffinity( hThread, &ga, nullptr );
}

int AvailableHwConcurrency() {
    atomic_do_once( &initialize_hardware_concurrency_info, hardware_concurrency_info );
    return theProcessorGroups[ProcessorGroupInfo::NumGroups - 1].numProcsRunningTotal;
}

/* End of _WIN32||_WIN64 implementation */
#else
#error AvailableHwConcurrency is not implemented for this OS
#endif

} // namespace r1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_HardwareConcurrency */