//===-- Elementary operations for x86 -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H

#if defined(LLVM_LIBC_ARCH_X86)
#include "src/__support/CPP/TypeTraits.h" // ConditionalType, EnableIfType
#include "src/string/memory_utils/backend_scalar.h"

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

// HAS_M128 / HAS_M256 / HAS_M512 expand to `true` or `false` depending on the
// instruction sets enabled at compile time. They gate the vector
// specializations below and drive the type selection in
// X86Backend::getNextType.
#if defined(__SSE2__)
#define HAS_M128 true
#else
#define HAS_M128 false
#endif

#if defined(__AVX2__)
#define HAS_M256 true
#else
#define HAS_M256 false
#endif

#if defined(__AVX512F__) && defined(__AVX512BW__)
#define HAS_M512 true
#else
#define HAS_M512 false
#endif

namespace __llvm_libc {

// Backend for x86. Operations on scalar types are forwarded to
// Scalar64BitBackend; operations on the vector types (__m128i, __m256i,
// __m512i) are declared here and explicitly specialized after the struct.
struct X86Backend : public Scalar64BitBackend {
  static constexpr bool IS_BACKEND_TYPE = true;

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src) {
    return Scalar64BitBackend::template load<T, TS, AS>(src);
  }

  // Scalar types use base class implementations.
  template <typename T, Temporality TS, Aligned AS,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value) {
    Scalar64BitBackend::template store<T, TS, AS>(dst, value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline uint64_t notEquals(T v1, T v2) {
    return Scalar64BitBackend::template notEquals<T>(v1, v2);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T splat(ubyte value) {
    return Scalar64BitBackend::template splat<T>(value);
  }

  // Scalar types use base class implementations.
  template <typename T,
            cpp::EnableIfType<Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return Scalar64BitBackend::template threeWayCmp<T>(v1, v2);
  }

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline T load(const T *src);

  // X86 types are specialized below.
  template <
      typename T, Temporality TS, Aligned AS,
      cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>, bool> = true>
  static inline void store(T *dst, T value);

  // X86 types are specialized below.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline T splat(ubyte value);

  // X86 types are specialized below.
  // The vector specializations return a mask with one bit per byte; a set bit
  // marks a byte position where v1 and v2 differ.
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline uint64_t notEquals(T v1, T v2);

  // Three-way comparison for vector types: returns the difference between the
  // bytes (read as unsigned char) at the lowest position flagged by
  // notEquals(v1, v2).
  template <typename T, cpp::EnableIfType<!Scalar64BitBackend::IsScalarType<T>,
                                          bool> = true>
  static inline int32_t threeWayCmp(T v1, T v2) {
    return char_diff(reinterpret_cast<char *>(&v1),
                     reinterpret_cast<char *>(&v2), notEquals(v1, v2));
  }

  // Returns the type to use to consume Size bytes.
  // The widest enabled vector type that fits in Size is preferred; otherwise
  // the choice is delegated to Scalar64BitBackend::getNextType.
  // NOTE(review): the vector type names are spelled unconditionally here, so
  // they must be declared even when the matching HAS_* flag is false. Confirm
  // this for builds without __SSE2__, where <immintrin.h> is not included.
  template <size_t Size>
  using getNextType = cpp::ConditionalType<
      (HAS_M512 && Size >= 64), __m512i,
      cpp::ConditionalType<
          (HAS_M256 && Size >= 32), __m256i,
          cpp::ConditionalType<(HAS_M128 && Size >= 16), __m128i,
                               Scalar64BitBackend::getNextType<Size>>>>;

private:
  // Returns a[i] - b[i], both read as unsigned char, where i is the index of
  // the lowest set bit of `mask`. A zero mask selects index 0 so that
  // __builtin_ctzll is never called with 0 (its result is undefined then).
  static inline int32_t char_diff(const char *a, const char *b, uint64_t mask) {
    const size_t diff_index = mask == 0 ? 0 : __builtin_ctzll(mask);
    const int16_t ca = static_cast<unsigned char>(a[diff_index]);
    const int16_t cb = static_cast<unsigned char>(b[diff_index]);
    return ca - cb;
  }
};

// Copies `runtime_size` bytes from `src` to `dst` with the `rep movsb`
// instruction. The operands are bound to the registers the instruction uses
// ("D", "S" and "c" constraints) and are marked read-write because the
// instruction updates them.
static inline void repmovsb(void *dst, const void *src, size_t runtime_size) {
  asm volatile("rep movsb"
               : "+D"(dst), "+S"(src), "+c"(runtime_size)
               :
               : "memory");
}

// Defines the explicit specialization X86Backend::load<T, OS, AS> as a call to
// INTRINSIC. The const_cast is there because INTRINSIC may be declared with a
// pointer-to-non-const parameter.
#define SPECIALIZE_LOAD(T, OS, AS, INTRINSIC)                                  \
  template <> inline T X86Backend::load<T, OS, AS>(const T *src) {             \
    return INTRINSIC(const_cast<T *>(src));                                    \
  }
// Defines the explicit specialization X86Backend::store<T, OS, AS> as a call
// to INTRINSIC.
#define SPECIALIZE_STORE(T, OS, AS, INTRINSIC)                                 \
  template <> inline void X86Backend::store<T, OS, AS>(T * dst, T value) {     \
    INTRINSIC(dst, value);                                                     \
  }

#if HAS_M128
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_load_si128)
SPECIALIZE_LOAD(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_loadu_si128)
// X86 non-temporal load needs aligned access.
#if defined(__SSE4_1__)
SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm_stream_load_si128)
#else
// _mm_stream_load_si128 is an SSE4.1 intrinsic while this section is enabled
// by SSE2 alone, so fall back to the regular aligned load.
SPECIALIZE_LOAD(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm_load_si128)
#endif // __SSE4_1__
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::YES, _mm_store_si128)
SPECIALIZE_STORE(__m128i, Temporality::TEMPORAL, Aligned::NO, _mm_storeu_si128)
// X86 non-temporal store needs aligned access.
SPECIALIZE_STORE(__m128i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm_stream_si128)
template <> inline __m128i X86Backend::splat<__m128i>(ubyte value) {
  return _mm_set1_epi8(__builtin_bit_cast(char, value));
}
template <>
inline uint64_t X86Backend::notEquals<__m128i>(__m128i a, __m128i b) {
  using T = char __attribute__((__vector_size__(16)));
  return _mm_movemask_epi8(T(a) != T(b));
}
#endif // HAS_M128

#if HAS_M256
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::YES, _mm256_load_si256)
SPECIALIZE_LOAD(__m256i, Temporality::TEMPORAL, Aligned::NO, _mm256_loadu_si256)
// X86 non-temporal load needs aligned access.
SPECIALIZE_LOAD(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm256_stream_load_si256)
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::YES,
                 _mm256_store_si256)
SPECIALIZE_STORE(__m256i, Temporality::TEMPORAL, Aligned::NO,
                 _mm256_storeu_si256)
// X86 non-temporal store needs aligned access.
SPECIALIZE_STORE(__m256i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm256_stream_si256)
template <> inline __m256i X86Backend::splat<__m256i>(ubyte value) {
  return _mm256_set1_epi8(__builtin_bit_cast(char, value));
}
template <>
inline uint64_t X86Backend::notEquals<__m256i>(__m256i a, __m256i b) {
  using T = char __attribute__((__vector_size__(32)));
  // _mm256_movemask_epi8 returns a signed int whose bit 31 is set when the
  // last byte differs. Go through uint32_t so that widening to uint64_t does
  // not sign-extend into bits 32..63, which correspond to no byte.
  return static_cast<uint32_t>(_mm256_movemask_epi8(T(a) != T(b)));
}
#endif // HAS_M256

#if HAS_M512
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::YES, _mm512_load_si512)
SPECIALIZE_LOAD(__m512i, Temporality::TEMPORAL, Aligned::NO, _mm512_loadu_si512)
// X86 non-temporal load needs aligned access.
SPECIALIZE_LOAD(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                _mm512_stream_load_si512)
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::YES,
                 _mm512_store_si512)
SPECIALIZE_STORE(__m512i, Temporality::TEMPORAL, Aligned::NO,
                 _mm512_storeu_si512)
// X86 non-temporal store needs aligned access.
SPECIALIZE_STORE(__m512i, Temporality::NON_TEMPORAL, Aligned::YES,
                 _mm512_stream_si512)
template <> inline __m512i X86Backend::splat<__m512i>(ubyte value) {
  return _mm512_broadcastb_epi8(_mm_set1_epi8(__builtin_bit_cast(char, value)));
}
template <>
inline uint64_t X86Backend::notEquals<__m512i>(__m512i a, __m512i b) {
  return _mm512_cmpneq_epi8_mask(a, b);
}
#endif // HAS_M512

// Fixed-size operations instantiated with the x86 backend; the alias name is
// the Size argument passed to SizedOp.
namespace x86 {
using _1 = SizedOp<X86Backend, 1>;
using _2 = SizedOp<X86Backend, 2>;
using _3 = SizedOp<X86Backend, 3>;
using _4 = SizedOp<X86Backend, 4>;
using _8 = SizedOp<X86Backend, 8>;
using _16 = SizedOp<X86Backend, 16>;
using _32 = SizedOp<X86Backend, 32>;
using _64 = SizedOp<X86Backend, 64>;
using _128 = SizedOp<X86Backend, 128>;
} // namespace x86

} // namespace __llvm_libc

#endif // defined(LLVM_LIBC_ARCH_X86)

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BACKEND_X86_H