//===-- Elementary operations for x86 -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H

#if defined(LLVM_LIBC_ARCH_X86)
#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
#include "src/string/memory_utils/backend_scalar.h"

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

#if defined(__SSE2__)
#define HAS_M128 true
#else
#define HAS_M128 false
#endif

#if defined(__AVX2__)
#define HAS_M256 true
#else
#define HAS_M256 false
#endif

#if defined(__AVX512F__) and defined(__AVX512BW__)
#define HAS_M512 true
#else
#define HAS_M512 false
#endif

namespace __llvm_libc {
struct X86Backend : public Scalar64BitBackend {
  static constexpr bool IS_BACKEND_TYPE = true;

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src) {
    return Scalar64BitBackend::template load<T, TS, AS>(src);
  }

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value) {
    Scalar64BitBackend::template store<T, TS, AS>(dst, value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline uint64_t notEquals(T v1, T v2) {
    return Scalar64BitBackend::template notEquals<T>(v1, v2);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T splat(ubyte value) {
    return Scalar64BitBackend::template splat<T>(value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return Scalar64BitBackend::template threeWayCmp<T>(v1, v2);
  }

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src);

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline T splat(ubyte value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline uint64_t notEquals(T v1, T v2);

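  // For non-scalar types, threeWayCmp is derived from notEquals: the mask of
  // differing bytes locates the first mismatch, which is then compared as
  // unsigned chars (see char_diff below).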
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return char_diff(reinterpret_cast<char *>(&v1),
                     reinterpret_cast<char *>(&v2), notEquals(v1, v2));
  }

  // Returns the type to use to consume Size bytes.
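  // The widest vector type available at compile time is preferred:
  // __m512i, then __m256i, then __m128i, then the scalar fallback.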
  template <size_t Size>
  using getNextType = cpp::ConditionalType<
      (HAS_M512 && Size >= 64), __m512i,
      cpp::ConditionalType<
          (HAS_M256 && Size >= 32), __m256i,
          cpp::ConditionalType<(HAS_M128 && Size >= 16), __m128i,
                               Scalar64BitBackend::getNextType<Size>>>>;

private:
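  // Returns the signed difference between the first differing bytes of `a` and
  // `b`. `mask` has one bit set per differing byte; the lowest set bit gives
  // the index of the first mismatch. A zero mask means the buffers are equal.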
  static inline int32_t char_diff(const char *a, const char *b, uint64_t mask) {
    const size_t diff_index = mask == 0 ? 0 : __builtin_ctzll(mask);
    const int16_t ca = (unsigned char)a[diff_index];
    const int16_t cb = (unsigned char)b[diff_index];
    return ca - cb;
  }
};

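// Copies `runtime_size` bytes from `src` to `dst` using the x86 `rep movsb`
// string instruction. The inline asm declares dst, src and the byte count as
// read/write operands and clobbers memory.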
static inline void repmovsb(void *dst, const void *src, size_t runtime_size) {
  asm volatile("rep movsb"
               : "+D"(dst), "+S"(src), "+c"(runtime_size)
               :
               : "memory");
}

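// Helper macros generating the load/store template specializations that map a
// (vector type, temporality, alignment) triple onto the matching intrinsic.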
#define SPECIALIZE_LOAD(T, OS, AS, INTRINSIC)                                  \
  template <> inline T X86Backend::load<T, OS, AS>(const T *src) {             \
    return INTRINSIC(const_cast<T *>(src));                                    \
  }
#define SPECIALIZE_STORE(T, OS, AS, INTRINSIC)                                 \
  template <> inline void X86Backend::store<T, OS, AS>(T * dst, T value) {     \
    INTRINSIC(dst, value);                                                     \
  }

#if HAS_M128
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_load_si128)
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_loadu_si128)
SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm_stream_load_si128)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_store_si128)
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_storeu_si128)
SPECIALIZE_STORE(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm_stream_si128)
// X86 non-temporal store needs aligned access
template <> inline __m128i X86Backend::splat<__m128i>(ubyte value) {
  return _mm_set1_epi8(__builtin_bit_cast(char, value));
}
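// Returns a mask with one bit set per byte lane where `a` and `b` differ
// (_mm_movemask_epi8 collects the sign bit of each 8-bit comparison result).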
template <>
inline uint64_t X86Backend::notEquals<__m128i>(__m128i a, __m128i b) {
  using T = char __attribute__((__vector_size__(16)));
  return _mm_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M128

#if HAS_M256
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::YES, _mm256_load_si256)
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::NO, _mm256_loadu_si256)
SPECIALIZE_LOAD(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm256_stream_load_si256)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::YES,
                 _mm256_store_si256)
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::NO,
                 _mm256_storeu_si256)
SPECIALIZE_STORE(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm256_stream_si256)
// X86 non-temporal store needs aligned access
template <> inline __m256i X86Backend::splat<__m256i>(ubyte value) {
  return _mm256_set1_epi8(__builtin_bit_cast(char, value));
}
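// Same scheme as the __m128i case: one mask bit per differing byte lane.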
template <>
inline uint64_t X86Backend::notEquals<__m256i>(__m256i a, __m256i b) {
  using T = char __attribute__((__vector_size__(32)));
  return _mm256_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M256

#if HAS_M512
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::YES, _mm512_load_si512)
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::NO, _mm512_loadu_si512)
SPECIALIZE_LOAD(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm512_stream_load_si512)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::YES,
                 _mm512_store_si512)
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::NO,
                 _mm512_storeu_si512)
SPECIALIZE_STORE(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm512_stream_si512)
// X86 non-temporal store needs aligned access
template <> inline __m512i X86Backend::splat<__m512i>(ubyte value) {
  return _mm512_broadcastb_epi8(_mm_set1_epi8(__builtin_bit_cast(char, value)));
}
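// The AVX512BW byte comparison yields a 64-bit mask directly, one bit per lane.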
template <>
inline uint64_t X86Backend::notEquals<__m512i>(__m512i a, __m512i b) {
  return _mm512_cmpneq_epi8_mask(a, b);
}
#endif // HAS_M512

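// Convenience aliases for fixed-size operations parameterized over the X86
// backend.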
namespace x86 {
using _1 = SizedOp<X86Backend, 1>;
using _2 = SizedOp<X86Backend, 2>;
using _3 = SizedOp<X86Backend, 3>;
using _4 = SizedOp<X86Backend, 4>;
using _8 = SizedOp<X86Backend, 8>;
using _16 = SizedOp<X86Backend, 16>;
using _32 = SizedOp<X86Backend, 32>;
using _64 = SizedOp<X86Backend, 64>;
using _128 = SizedOp<X86Backend, 128>;
} // namespace x86

} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_X86)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H