1 /*
2 Copyright (c) 2005-2023 Intel Corporation
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15 */
16
17 #ifndef __TBB_detail__machine_H
18 #define __TBB_detail__machine_H
19
20 #include "_config.h"
21 #include "_assert.h"
22
23 #include <atomic>
24 #include <climits>
25 #include <cstdint>
26 #include <cstddef>
27
28 #ifdef _WIN32
29 #include <intrin.h>
30 #ifdef __TBBMALLOC_BUILD
31 #define WIN32_LEAN_AND_MEAN
32 #ifndef NOMINMAX
33 #define NOMINMAX
34 #endif
35 #include <windows.h> // SwitchToThread()
36 #endif
37 #ifdef _MSC_VER
38 #if __TBB_x86_64 || __TBB_x86_32
39 #pragma intrinsic(__rdtsc)
40 #endif
41 #endif
42 #endif
43 #if __TBB_x86_64 || __TBB_x86_32
44 #include <immintrin.h> // _mm_pause
45 #endif
46 #if (_WIN32)
47 #include <float.h> // _control87
48 #endif
49
50 #if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
51 #include <sched.h> // sched_yield
52 #else
53 #include <thread> // std::this_thread::yield()
54 #endif
55
56 namespace tbb {
57 namespace detail {
58 inline namespace d0 {
59
60 //--------------------------------------------------------------------------------------------------
61 // Yield implementation
62 //--------------------------------------------------------------------------------------------------
63
#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
// Workaround for libstdc++ versions where std::this_thread::yield() is broken:
// call the POSIX primitive directly.
static inline void yield() {
    int err = sched_yield();
    // __TBB_ASSERT_EX evaluates its argument even in release builds, so `err`
    // is not "unused" when asserts are compiled out.
    __TBB_ASSERT_EX(err == 0, "sched_yield has failed");
}
#elif __TBBMALLOC_BUILD && _WIN32
// Use Windows API for yield in tbbmalloc to avoid dependency on C++ runtime with some implementations.
static inline void yield() {
    SwitchToThread();
}
#else
// Default: expose the standard yield under the tbb::detail::d0 name.
using std::this_thread::yield;
#endif
77
78 //--------------------------------------------------------------------------------------------------
79 // atomic_fence_seq_cst implementation
80 //--------------------------------------------------------------------------------------------------
81
// Issues a full (sequentially consistent) memory fence.
static inline void atomic_fence_seq_cst() {
#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
    // On x86 a locked read-modify-write on a dummy stack byte acts as a full
    // fence. Used instead of std::atomic_thread_fence for GCC < 11 —
    // presumably to avoid the code older GCC emits for a seq_cst fence
    // (NOTE(review): confirm against GCC codegen history if this path is revisited).
    unsigned char dummy = 0u;
    __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
#else
    std::atomic_thread_fence(std::memory_order_seq_cst);
#endif
}
90
91 //--------------------------------------------------------------------------------------------------
92 // Pause implementation
93 //--------------------------------------------------------------------------------------------------
94
machine_pause(int32_t delay)95 static inline void machine_pause(int32_t delay) {
96 #if __TBB_x86_64 || __TBB_x86_32
97 while (delay-- > 0) { _mm_pause(); }
98 #elif __ARM_ARCH_7A__ || __aarch64__
99 while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
100 #else /* Generic */
101 (void)delay; // suppress without including _template_helpers.h
102 yield();
103 #endif
104 }
105
106 ////////////////////////////////////////////////////////////////////////////////////////////////////
107 // tbb::detail::log2() implementation
108 ////////////////////////////////////////////////////////////////////////////////////////////////////
109 // TODO: Use log2p1() function that will be available in C++20 standard
110
#if defined(__GNUC__) || defined(__clang__)
namespace gnu_builtins {
// Count-leading-zeros wrappers, one overload per unsigned width.
// NOTE: __builtin_clz/clzl/clzll have an undefined result for x == 0;
// callers (machine_log2) must pass a non-zero value.
inline uintptr_t clz(unsigned int x) { return static_cast<uintptr_t>(__builtin_clz(x)); }
inline uintptr_t clz(unsigned long int x) { return static_cast<uintptr_t>(__builtin_clzl(x)); }
inline uintptr_t clz(unsigned long long int x) { return static_cast<uintptr_t>(__builtin_clzll(x)); }
}
#elif defined(_MSC_VER)
#pragma intrinsic(__TBB_W(_BitScanReverse))
namespace msvc_intrinsics {
// Returns the bit index of the most significant set bit of i.
// NOTE(review): _BitScanReverse leaves the output undefined when i == 0;
// callers are expected to pass a non-zero value.
static inline uintptr_t bit_scan_reverse(uintptr_t i) {
    unsigned long j;
    // __TBB_W selects the 32- or 64-bit intrinsic to match uintptr_t.
    __TBB_W(_BitScanReverse)( &j, i );
    return j;
}
}
#endif
127
// Width of type T in bits (e.g. 32 for std::uint32_t).
template <typename T>
constexpr std::uintptr_t number_of_bits() {
    return static_cast<std::uintptr_t>(sizeof(T)) * CHAR_BIT;
}
132
133 // logarithm is the index of the most significant non-zero bit
machine_log2(uintptr_t x)134 static inline uintptr_t machine_log2(uintptr_t x) {
135 #if defined(__GNUC__) || defined(__clang__)
136 // If P is a power of 2 and x<P, then (P-1)-x == (P-1) XOR x
137 return (number_of_bits<decltype(x)>() - 1) ^ gnu_builtins::clz(x);
138 #elif defined(_MSC_VER)
139 return msvc_intrinsics::bit_scan_reverse(x);
140 #elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__
141 uintptr_t j, i = x;
142 __asm__("bsr %1,%0" : "=r"(j) : "r"(i));
143 return j;
144 #elif __powerpc__ || __POWERPC__
145 #if __TBB_WORDSIZE==8
146 __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
147 return 63 - static_cast<intptr_t>(x);
148 #else
149 __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
150 return 31 - static_cast<intptr_t>(x);
151 #endif /*__TBB_WORDSIZE*/
152 #elif __sparc
153 uint64_t count;
154 // one hot encode
155 x |= (x >> 1);
156 x |= (x >> 2);
157 x |= (x >> 4);
158 x |= (x >> 8);
159 x |= (x >> 16);
160 x |= (x >> 32);
161 // count 1's
162 __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) );
163 return count - 1;
164 #else
165 intptr_t result = 0;
166
167 if( sizeof(x) > 4 && (uintptr_t tmp = x >> 32) ) { x = tmp; result += 32; }
168 if( uintptr_t tmp = x >> 16 ) { x = tmp; result += 16; }
169 if( uintptr_t tmp = x >> 8 ) { x = tmp; result += 8; }
170 if( uintptr_t tmp = x >> 4 ) { x = tmp; result += 4; }
171 if( uintptr_t tmp = x >> 2 ) { x = tmp; result += 2; }
172
173 return (x & 2) ? result + 1 : result;
174 #endif
175 }
176
177 ////////////////////////////////////////////////////////////////////////////////////////////////////
178 // tbb::detail::reverse_bits() implementation
179 ////////////////////////////////////////////////////////////////////////////////////////////////////
180 #if TBB_USE_CLANG_BITREVERSE_BUILTINS
namespace llvm_builtins {
// Bit-reversal via clang builtins; one overload per fixed-width type.
inline uint8_t builtin_bitreverse(uint8_t x) { return __builtin_bitreverse8 (x); }
inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); }
inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); }
inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); }
}
187 #else // generic
// Generic bit-reversal support: a 256-entry table where entry i holds the
// value of i with its 8 bits mirrored (e.g. byte_table[0x01] == 0x80).
// Presumably templated so the definition can live in this header (each T
// instantiates its own static member) — only T = unsigned char is used below.
template<typename T>
struct reverse {
    static const T byte_table[256];
};

template<typename T>
const T reverse<T>::byte_table[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

// Mirrors the bits of one byte using the table above.
inline unsigned char reverse_byte(unsigned char src) {
    return reverse<unsigned char>::byte_table[src];
}
216 #endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
217
218 template<typename T>
machine_reverse_bits(T src)219 T machine_reverse_bits(T src) {
220 #if TBB_USE_CLANG_BITREVERSE_BUILTINS
221 return builtin_bitreverse(fixed_width_cast(src));
222 #else /* Generic */
223 T dst;
224 unsigned char *original = reinterpret_cast<unsigned char *>(&src);
225 unsigned char *reversed = reinterpret_cast<unsigned char *>(&dst);
226
227 for ( int i = sizeof(T) - 1; i >= 0; i-- ) {
228 reversed[i] = reverse_byte( original[sizeof(T) - i - 1] );
229 }
230
231 return dst;
232 #endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
233 }
234
235 } // inline namespace d0
236
237 namespace d1 {
238
239 #if (_WIN32)
240 // API to retrieve/update FPU control setting
241 #define __TBB_CPU_CTL_ENV_PRESENT 1
// Snapshot of the thread's FPU control state (x87 control word and, on x86,
// the SSE MXCSR register), captured/restored via the MSVC runtime intrinsics.
struct cpu_ctl_env {
    unsigned int x87cw{};
#if (__TBB_x86_64)
    // Changing the infinity mode or the floating-point precision is not supported on x64.
    // The attempt causes an assertion. See
    // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2
    static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC;
#else
    static constexpr unsigned int X87CW_CONTROL_MASK = ~0U;
#endif
#if (__TBB_x86_32 || __TBB_x86_64)
    unsigned int mxcsr{};
    static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */
#endif

    // True if any captured control state differs between the two snapshots.
    bool operator!=( const cpu_ctl_env& ctl ) const {
        return
#if (__TBB_x86_32 || __TBB_x86_64)
            mxcsr != ctl.mxcsr ||
#endif
            x87cw != ctl.x87cw;
    }
    // Captures the current thread's FPU control state.
    void get_env() {
        x87cw = _control87(0, 0); // mask 0 => pure query, changes nothing
#if (__TBB_x86_32 || __TBB_x86_64)
        mxcsr = _mm_getcsr();
#endif
    }
    // Restores the captured state; only bits selected by the masks are written.
    void set_env() const {
        _control87(x87cw, X87CW_CONTROL_MASK);
#if (__TBB_x86_32 || __TBB_x86_64)
        _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK);
#endif
    }
};
277 #elif (__TBB_x86_32 || __TBB_x86_64)
278 // API to retrieve/update FPU control setting
279 #define __TBB_CPU_CTL_ENV_PRESENT 1
// Snapshot of the thread's FPU control state (SSE MXCSR and x87 control word),
// captured/restored with inline assembly on x86 GCC/Clang builds.
struct cpu_ctl_env {
    int mxcsr{};
    short x87cw{};
    static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */

    // True if either control register snapshot differs.
    bool operator!=(const cpu_ctl_env& ctl) const {
        return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw;
    }
    // Captures MXCSR (stmxcsr) and the x87 control word (fstcw).
    void get_env() {
        __asm__ __volatile__(
            "stmxcsr %0\n\t"
            "fstcw %1"
            : "=m"(mxcsr), "=m"(x87cw)
        );
        // Keep only control bits so operator!= ignores transient status flags.
        mxcsr &= MXCSR_CONTROL_MASK;
    }
    // Restores the captured control state (ldmxcsr + fldcw).
    void set_env() const {
        __asm__ __volatile__(
            "ldmxcsr %0\n\t"
            "fldcw %1"
            : : "m"(mxcsr), "m"(x87cw)
        );
    }
};
304 #endif
305
306 } // namespace d1
307
308 } // namespace detail
309 } // namespace tbb
310
311 #if !__TBB_CPU_CTL_ENV_PRESENT
312 #include <fenv.h>
313
314 #include <cstring>
315
316 namespace tbb {
317 namespace detail {
318
319 namespace r1 {
320 void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
321 void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
322 } // namespace r1
323
324 namespace d1 {
325
326 class cpu_ctl_env {
327 fenv_t *my_fenv_ptr;
328 public:
cpu_ctl_env()329 cpu_ctl_env() : my_fenv_ptr(nullptr) {}
~cpu_ctl_env()330 ~cpu_ctl_env() {
331 if ( my_fenv_ptr )
332 r1::cache_aligned_deallocate( (void*)my_fenv_ptr );
333 }
334 // It is possible not to copy memory but just to copy pointers but the following issues should be addressed:
335 // 1. The arena lifetime and the context lifetime are independent;
336 // 2. The user is allowed to recapture different FPU settings to context so 'current FPU settings' inside
337 // dispatch loop may become invalid.
338 // But do we really want to improve the fenv implementation? It seems to be better to replace the fenv implementation
339 // with a platform specific implementation.
cpu_ctl_env(const cpu_ctl_env & src)340 cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) {
341 *this = src;
342 }
343 cpu_ctl_env& operator=( const cpu_ctl_env &src ) {
344 __TBB_ASSERT( src.my_fenv_ptr, nullptr);
345 if ( !my_fenv_ptr )
346 my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
347 *my_fenv_ptr = *src.my_fenv_ptr;
348 return *this;
349 }
350 bool operator!=( const cpu_ctl_env &ctl ) const {
351 __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
352 __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." );
353 return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) );
354 }
get_env()355 void get_env () {
356 if ( !my_fenv_ptr )
357 my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
358 fegetenv( my_fenv_ptr );
359 }
set_env()360 const cpu_ctl_env& set_env () const {
361 __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
362 fesetenv( my_fenv_ptr );
363 return *this;
364 }
365 };
366
367 } // namespace d1
368 } // namespace detail
369 } // namespace tbb
370
371 #endif /* !__TBB_CPU_CTL_ENV_PRESENT */
372
373 #endif // __TBB_detail__machine_H
374