//===-- Elementary operations for x86 -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H

#if defined(LLVM_LIBC_ARCH_X86)
#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
#include "src/string/memory_utils/backend_scalar.h"

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

#if defined(__SSE2__)
#define HAS_M128 true
#else
#define HAS_M128 false
#endif

#if defined(__AVX2__)
#define HAS_M256 true
#else
#define HAS_M256 false
#endif

#if defined(__AVX512F__) && defined(__AVX512BW__)
#define HAS_M512 true
#else
#define HAS_M512 false
#endif
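
// The HAS_M128/HAS_M256/HAS_M512 constants capture, at compile time, which
// SIMD register widths the target supports; getNextType below uses them to
// pick the widest vector type available for a given block size.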

namespace __llvm_libc {
struct X86Backend : public Scalar64BitBackend {
  static constexpr bool IS_BACKEND_TYPE = true;

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src) {
    return Scalar64BitBackend::template load<T, TS, AS>(src);
  }

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value) {
    Scalar64BitBackend::template store<T, TS, AS>(dst, value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline uint64_t notEquals(T v1, T v2) {
    return Scalar64BitBackend::template notEquals<T>(v1, v2);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T splat(ubyte value) {
    return Scalar64BitBackend::template splat<T>(value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return Scalar64BitBackend::template threeWayCmp<T>(v1, v2);
  }
  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src);

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline T splat(ubyte value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline uint64_t notEquals(T v1, T v2);

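  // For non-scalar (vector) types, the three-way comparison is implemented
  // generically: notEquals yields a mask of the differing bytes, and
  // char_diff returns the signed difference of the first mismatching byte.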
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return char_diff(reinterpret_cast<char *>(&v1),
                     reinterpret_cast<char *>(&v2), notEquals(v1, v2));
  }

  // Returns the type to use to consume Size bytes.
  template <size_t Size>
  using getNextType = cpp::ConditionalType<
      (HAS_M512 && Size >= 64), __m512i,
      cpp::ConditionalType<
          (HAS_M256 && Size >= 32), __m256i,
          cpp::ConditionalType<(HAS_M128 && Size >= 16), __m128i,
                               Scalar64BitBackend::getNextType<Size>>>>;
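  // For example, with AVX2 but without AVX512, getNextType<64> is __m256i and
  // getNextType<16> is __m128i; sizes below 16 fall back to the scalar
  // backend's getNextType.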

private:
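  // 'mask' has one bit set per differing byte (as produced by notEquals).
  // Returns the signed difference between the first mismatching bytes of
  // 'a' and 'b', or 0 when the mask is empty.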
  static inline int32_t char_diff(const char *a, const char *b,
                                  uint64_t mask) {
    const size_t diff_index = mask == 0 ? 0 : __builtin_ctzll(mask);
    const int16_t ca = (unsigned char)a[diff_index];
    const int16_t cb = (unsigned char)b[diff_index];
    return ca - cb;
  }
};

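// Copies 'runtime_size' bytes from 'src' to 'dst' with a single 'rep movsb'
// instruction (dst in %rdi, src in %rsi, count in %rcx). On recent x86 CPUs
// this can be competitive with vectorized copies for large sizes.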
static inline void repmovsb(void *dst, const void *src, size_t runtime_size) {
  asm volatile("rep movsb"
               : "+D"(dst), "+S"(src), "+c"(runtime_size)
               :
               : "memory");
}

#define SPECIALIZE_LOAD(T, OS, AS, INTRINSIC)                                  \
  template <> inline T X86Backend::load<T, OS, AS>(const T *src) {             \
    return INTRINSIC(const_cast<T *>(src));                                    \
  }
#define SPECIALIZE_STORE(T, OS, AS, INTRINSIC)                                 \
  template <> inline void X86Backend::store<T, OS, AS>(T * dst, T value) {     \
    INTRINSIC(dst, value);                                                     \
  }

#if HAS_M128
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_load_si128)
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_loadu_si128)
SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm_stream_load_si128)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_store_si128)
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_storeu_si128)
SPECIALIZE_STORE(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm_stream_si128)
// X86 non-temporal store needs aligned access
template <> inline __m128i X86Backend::splat<__m128i>(ubyte value) {
  return _mm_set1_epi8(__builtin_bit_cast(char, value));
}
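// The vector-extension '!=' yields all-ones per differing byte;
// _mm_movemask_epi8 then packs the sign bits into a 16-bit mask with one bit
// set per mismatching byte, which char_diff consumes.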
template <>
inline uint64_t X86Backend::notEquals<__m128i>(__m128i a, __m128i b) {
  using T = char __attribute__((__vector_size__(16)));
  return _mm_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M128

#if HAS_M256
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::YES, _mm256_load_si256)
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::NO, _mm256_loadu_si256)
SPECIALIZE_LOAD(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm256_stream_load_si256)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::YES,
                 _mm256_store_si256)
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::NO,
                 _mm256_storeu_si256)
SPECIALIZE_STORE(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm256_stream_si256)
// X86 non-temporal store needs aligned access
template <> inline __m256i X86Backend::splat<__m256i>(ubyte value) {
  return _mm256_set1_epi8(__builtin_bit_cast(char, value));
}
template <>
inline uint64_t X86Backend::notEquals<__m256i>(__m256i a, __m256i b) {
  using T = char __attribute__((__vector_size__(32)));
  return _mm256_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M256

#if HAS_M512
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::YES, _mm512_load_si512)
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::NO, _mm512_loadu_si512)
SPECIALIZE_LOAD(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm512_stream_load_si512)
// X86 non-temporal load needs aligned access
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::YES,
                 _mm512_store_si512)
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::NO,
                 _mm512_storeu_si512)
SPECIALIZE_STORE(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm512_stream_si512)
// X86 non-temporal store needs aligned access
template <> inline __m512i X86Backend::splat<__m512i>(ubyte value) {
  return _mm512_broadcastb_epi8(_mm_set1_epi8(__builtin_bit_cast(char, value)));
}
template <>
inline uint64_t X86Backend::notEquals<__m512i>(__m512i a, __m512i b) {
  return _mm512_cmpneq_epi8_mask(a, b);
}
#endif // HAS_M512

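// Fixed-size operation types over the X86Backend, used as building blocks by
// the higher-level memory function implementations; each alias handles
// exactly the indicated number of bytes using the widest type getNextType
// selects.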
namespace x86 {
using _1 = SizedOp<X86Backend, 1>;
using _2 = SizedOp<X86Backend, 2>;
using _3 = SizedOp<X86Backend, 3>;
using _4 = SizedOp<X86Backend, 4>;
using _8 = SizedOp<X86Backend, 8>;
using _16 = SizedOp<X86Backend, 16>;
using _32 = SizedOp<X86Backend, 32>;
using _64 = SizedOp<X86Backend, 64>;
using _128 = SizedOp<X86Backend, 128>;
} // namespace x86

} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_X86)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H