// oneTBB: include/oneapi/tbb/detail/_machine.h (revision 0cf592bd)
/*
    Copyright (c) 2005-2023 Intel Corporation

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
*/

#ifndef __TBB_detail__machine_H
#define __TBB_detail__machine_H

#include "_config.h"
#include "_assert.h"

#include <atomic>
#include <climits>
#include <cstdint>
#include <cstddef>

#ifdef _WIN32
#include <intrin.h>
#ifdef __TBBMALLOC_BUILD
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h> // SwitchToThread()
#endif
#ifdef _MSC_VER
#if __TBB_x86_64 || __TBB_x86_32
#pragma intrinsic(__rdtsc)
#endif
#endif
#endif
#if __TBB_x86_64 || __TBB_x86_32
#include <immintrin.h> // _mm_pause
#endif
#if (_WIN32)
#include <float.h> // _control87
#endif

#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
#include <sched.h> // sched_yield
#else
#include <thread> // std::this_thread::yield()
#endif

namespace tbb {
namespace detail {
inline namespace d0 {

//--------------------------------------------------------------------------------------------------
// Yield implementation
//--------------------------------------------------------------------------------------------------

#if __TBB_GLIBCXX_THIS_THREAD_YIELD_BROKEN
static inline void yield() {
    int err = sched_yield();
    __TBB_ASSERT_EX(err == 0, "sched_yield has failed");
}
#elif __TBBMALLOC_BUILD && _WIN32
// Use the Windows API for yield in tbbmalloc to avoid a dependency on the C++ runtime with some implementations.
static inline void yield() {
    SwitchToThread();
}
#else
using std::this_thread::yield;
#endif
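
// Illustrative usage (not part of this header): spin waits in the library fall back to
// yield() once busy-waiting stops paying off. A minimal sketch, assuming the caller owns
// a std::atomic<bool> flag:
//
//     void wait_until_ready(std::atomic<bool>& ready) {
//         while (!ready.load(std::memory_order_acquire))
//             yield(); // give up the remainder of the time slice to another thread
//     }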

//--------------------------------------------------------------------------------------------------
// atomic_fence_seq_cst implementation
//--------------------------------------------------------------------------------------------------

static inline void atomic_fence_seq_cst() {
#if (__TBB_x86_64 || __TBB_x86_32) && defined(__GNUC__) && __GNUC__ < 11
    unsigned char dummy = 0u;
    __asm__ __volatile__ ("lock; notb %0" : "+m" (dummy) :: "memory");
#else
    std::atomic_thread_fence(std::memory_order_seq_cst);
#endif
}
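
// Illustrative usage (a sketch, not code from this header): a sequentially consistent fence
// is what Dekker-style protocols need between publishing one's own flag and reading the
// other side's flag with relaxed atomics:
//
//     std::atomic<int> my_flag{0}, other_flag{0};
//     my_flag.store(1, std::memory_order_relaxed);
//     atomic_fence_seq_cst();   // orders the preceding store before the following load
//     bool other_busy = other_flag.load(std::memory_order_relaxed) != 0;
//
// The inline-asm branch above exists because GCC prior to version 11 emits an mfence for the
// C++ seq_cst fence on x86; a locked read-modify-write on a dummy stack byte provides the
// same ordering and is typically cheaper.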

//--------------------------------------------------------------------------------------------------
// Pause implementation
//--------------------------------------------------------------------------------------------------

static inline void machine_pause(int32_t delay) {
#if __TBB_x86_64 || __TBB_x86_32
    while (delay-- > 0) { _mm_pause(); }
#elif __ARM_ARCH_7A__ || __aarch64__
    while (delay-- > 0) { __asm__ __volatile__("yield" ::: "memory"); }
#else /* Generic */
    (void)delay; // suppress without including _template_helpers.h
    yield();
#endif
}
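
// Illustrative usage (a sketch, not code from this header): machine_pause() is meant to be
// combined with exponential backoff before yielding the processor, roughly:
//
//     int backoff = 1;
//     while (!done.load(std::memory_order_acquire)) {  // 'done' is a hypothetical std::atomic<bool>
//         if (backoff <= 16) {
//             machine_pause(backoff);                  // short busy-wait, cheap on SMT siblings
//             backoff *= 2;
//         } else {
//             yield();                                 // waited long enough; let others run
//         }
//     }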

////////////////////////////////////////////////////////////////////////////////////////////////////
// tbb::detail::log2() implementation
////////////////////////////////////////////////////////////////////////////////////////////////////
// TODO: Use std::bit_width() (the C++20 name of the proposed log2p1()) once C++20 is required

#if defined(__GNUC__) || defined(__clang__)
namespace gnu_builtins {
    inline uintptr_t clz(unsigned int x) { return static_cast<uintptr_t>(__builtin_clz(x)); }
    inline uintptr_t clz(unsigned long int x) { return static_cast<uintptr_t>(__builtin_clzl(x)); }
    inline uintptr_t clz(unsigned long long int x) { return static_cast<uintptr_t>(__builtin_clzll(x)); }
}
#elif defined(_MSC_VER)
#pragma intrinsic(__TBB_W(_BitScanReverse))
namespace msvc_intrinsics {
    static inline uintptr_t bit_scan_reverse(uintptr_t i) {
        unsigned long j;
        __TBB_W(_BitScanReverse)( &j, i );
        return j;
    }
}
#endif

template <typename T>
constexpr std::uintptr_t number_of_bits() {
    return sizeof(T) * CHAR_BIT;
}

// logarithm is the index of the most significant non-zero bit
static inline uintptr_t machine_log2(uintptr_t x) {
#if defined(__GNUC__) || defined(__clang__)
    // If P is a power of 2 and x<P, then (P-1)-x == (P-1) XOR x
    return (number_of_bits<decltype(x)>() - 1) ^ gnu_builtins::clz(x);
#elif defined(_MSC_VER)
    return msvc_intrinsics::bit_scan_reverse(x);
#elif __i386__ || __i386 /*for Sun OS*/ || __MINGW32__
    uintptr_t j, i = x;
    __asm__("bsr %1,%0" : "=r"(j) : "r"(i));
    return j;
#elif __powerpc__ || __POWERPC__
    #if __TBB_WORDSIZE==8
    __asm__ __volatile__ ("cntlzd %0,%0" : "+r"(x));
    return 63 - static_cast<intptr_t>(x);
    #else
    __asm__ __volatile__ ("cntlzw %0,%0" : "+r"(x));
    return 31 - static_cast<intptr_t>(x);
    #endif /*__TBB_WORDSIZE*/
#elif __sparc
    uint64_t count;
    // set every bit below the most significant one
    x |= (x >> 1);
    x |= (x >> 2);
    x |= (x >> 4);
    x |= (x >> 8);
    x |= (x >> 16);
    x |= (x >> 32);
    // count the set bits; the index of the most significant bit is one less than that count
    __asm__ ("popc %1, %0" : "=r"(count) : "r"(x) );
    return count - 1;
#else
    intptr_t result = 0;
    uintptr_t tmp;

    if( sizeof(x) > 4 && (tmp = x >> 32) != 0 ) { x = tmp; result += 32; }
    if( (tmp = x >> 16) != 0 ) { x = tmp; result += 16; }
    if( (tmp = x >> 8) != 0 )  { x = tmp; result += 8; }
    if( (tmp = x >> 4) != 0 )  { x = tmp; result += 4; }
    if( (tmp = x >> 2) != 0 )  { x = tmp; result += 2; }

    return (x & 2) ? result + 1 : result;
#endif
}
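
// Worked example (illustration only): on an LP64 target number_of_bits<uintptr_t>() is 64,
// so for x == 40 (binary 101000) clz(x) == 58 and
//     machine_log2(40) == (64 - 1) ^ 58 == 63 - 58 == 5,
// i.e. the index of the most significant set bit. The XOR trick works because 63 is 0b111111
// and 58 < 64, so 63 ^ 58 equals 63 - 58. Note that the argument must be non-zero: both
// __builtin_clz and _BitScanReverse leave the result undefined for 0.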

////////////////////////////////////////////////////////////////////////////////////////////////////
// tbb::detail::reverse_bits() implementation
////////////////////////////////////////////////////////////////////////////////////////////////////
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
namespace llvm_builtins {
    inline uint8_t  builtin_bitreverse(uint8_t  x) { return __builtin_bitreverse8 (x); }
    inline uint16_t builtin_bitreverse(uint16_t x) { return __builtin_bitreverse16(x); }
    inline uint32_t builtin_bitreverse(uint32_t x) { return __builtin_bitreverse32(x); }
    inline uint64_t builtin_bitreverse(uint64_t x) { return __builtin_bitreverse64(x); }
}
#else // generic
template<typename T>
struct reverse {
    static const T byte_table[256];
};

template<typename T>
const T reverse<T>::byte_table[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};

inline unsigned char reverse_byte(unsigned char src) {
    return reverse<unsigned char>::byte_table[src];
}
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS

template<typename T>
T machine_reverse_bits(T src) {
#if TBB_USE_CLANG_BITREVERSE_BUILTINS
    return builtin_bitreverse(fixed_width_cast(src));
#else /* Generic */
    T dst;
    unsigned char *original = reinterpret_cast<unsigned char *>(&src);
    unsigned char *reversed = reinterpret_cast<unsigned char *>(&dst);

    for ( int i = sizeof(T) - 1; i >= 0; i-- ) {
        reversed[i] = reverse_byte( original[sizeof(T) - i - 1] );
    }

    return dst;
#endif // TBB_USE_CLANG_BITREVERSE_BUILTINS
}
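
// Worked example (illustration only): reversing the bits of a multi-byte value reverses both
// the byte order and the bits within each byte, so on any endianness
//     machine_reverse_bits(std::uint16_t{0x00F1}) == 0x8F00
// because 0xF1 == 0b11110001 reversed is 0b10001111 == 0x8F, and that byte moves from the
// least significant to the most significant position.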

} // inline namespace d0

namespace d1 {

#if (_WIN32)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
struct cpu_ctl_env {
    unsigned int x87cw{};
#if (__TBB_x86_64)
    // Changing the infinity mode or the floating-point precision is not supported on x64.
    // The attempt causes an assertion. See
    // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/control87-controlfp-control87-2
    static constexpr unsigned int X87CW_CONTROL_MASK = _MCW_DN | _MCW_EM | _MCW_RC;
#else
    static constexpr unsigned int X87CW_CONTROL_MASK = ~0U;
#endif
#if (__TBB_x86_32 || __TBB_x86_64)
    unsigned int mxcsr{};
    static constexpr unsigned int MXCSR_CONTROL_MASK = ~0x3fu; /* all except last six status bits */
#endif

    bool operator!=( const cpu_ctl_env& ctl ) const {
        return
#if (__TBB_x86_32 || __TBB_x86_64)
            mxcsr != ctl.mxcsr ||
#endif
            x87cw != ctl.x87cw;
    }
    void get_env() {
        x87cw = _control87(0, 0);
#if (__TBB_x86_32 || __TBB_x86_64)
        mxcsr = _mm_getcsr();
#endif
    }
    void set_env() const {
        _control87(x87cw, X87CW_CONTROL_MASK);
#if (__TBB_x86_32 || __TBB_x86_64)
        _mm_setcsr(mxcsr & MXCSR_CONTROL_MASK);
#endif
    }
};
#elif (__TBB_x86_32 || __TBB_x86_64)
// API to retrieve/update FPU control setting
#define __TBB_CPU_CTL_ENV_PRESENT 1
struct cpu_ctl_env {
    int     mxcsr{};
    short   x87cw{};
    static const int MXCSR_CONTROL_MASK = ~0x3f; /* all except last six status bits */

    bool operator!=(const cpu_ctl_env& ctl) const {
        return mxcsr != ctl.mxcsr || x87cw != ctl.x87cw;
    }
    void get_env() {
        __asm__ __volatile__(
            "stmxcsr %0\n\t"
            "fstcw %1"
            : "=m"(mxcsr), "=m"(x87cw)
        );
        mxcsr &= MXCSR_CONTROL_MASK;
    }
    void set_env() const {
        __asm__ __volatile__(
            "ldmxcsr %0\n\t"
            "fldcw %1"
            : : "m"(mxcsr), "m"(x87cw)
        );
    }
};
#endif
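
// Illustrative usage (a sketch, not code from this header): both struct variants expose the
// same capture/compare/restore protocol, which is what the scheduler relies on when
// propagating floating-point settings captured in a task_group_context:
//
//     cpu_ctl_env captured;
//     captured.get_env();            // snapshot the caller's x87/MXCSR control words
//
//     cpu_ctl_env current;
//     current.get_env();
//     if (current != captured)
//         captured.set_env();        // restore the captured settings before running tasks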

} // namespace d1

} // namespace detail
} // namespace tbb

#if !__TBB_CPU_CTL_ENV_PRESENT
#include <fenv.h>

#include <cstring>

namespace tbb {
namespace detail {

namespace r1 {
void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size);
void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p);
} // namespace r1

namespace d1 {

class cpu_ctl_env {
    fenv_t *my_fenv_ptr;
public:
    cpu_ctl_env() : my_fenv_ptr(nullptr) {}
    ~cpu_ctl_env() {
        if ( my_fenv_ptr )
            r1::cache_aligned_deallocate( (void*)my_fenv_ptr );
    }
    // It would be possible to copy just the pointer rather than the pointed-to memory, but then
    // the following issues would have to be addressed:
    //   1. The arena lifetime and the context lifetime are independent;
    //   2. The user is allowed to recapture different FPU settings for a context, so the
    //      'current FPU settings' inside the dispatch loop may become invalid.
    // However, rather than improving this fenv-based implementation, it seems better to replace
    // it with a platform-specific one.
    cpu_ctl_env( const cpu_ctl_env &src ) : my_fenv_ptr(nullptr) {
        *this = src;
    }
    cpu_ctl_env& operator=( const cpu_ctl_env &src ) {
        __TBB_ASSERT( src.my_fenv_ptr, nullptr);
        if ( !my_fenv_ptr )
            my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
        *my_fenv_ptr = *src.my_fenv_ptr;
        return *this;
    }
    bool operator!=( const cpu_ctl_env &ctl ) const {
        __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
        __TBB_ASSERT( ctl.my_fenv_ptr, "cpu_ctl_env is not initialized." );
        return std::memcmp( (void*)my_fenv_ptr, (void*)ctl.my_fenv_ptr, sizeof(fenv_t) );
    }
    void get_env () {
        if ( !my_fenv_ptr )
            my_fenv_ptr = (fenv_t*)r1::cache_aligned_allocate(sizeof(fenv_t));
        fegetenv( my_fenv_ptr );
    }
    const cpu_ctl_env& set_env () const {
        __TBB_ASSERT( my_fenv_ptr, "cpu_ctl_env is not initialized." );
        fesetenv( my_fenv_ptr );
        return *this;
    }
};
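
// The fenv-based fallback above follows the same get_env()/set_env() protocol as the
// platform-specific variants, except that the environment lives in a heap-allocated fenv_t
// captured on first use. A minimal sketch (illustration only):
//
//     cpu_ctl_env saved;
//     saved.get_env();     // allocates the fenv_t buffer and captures the FP environment
//     // ... run code that may change the rounding mode or exception masks ...
//     saved.set_env();     // restore the captured environment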

} // namespace d1
} // namespace detail
} // namespace tbb

#endif /* !__TBB_CPU_CTL_ENV_PRESENT */

#endif // __TBB_detail__machine_H