1 //===-- Implementation of memset and bzero --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H 10 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H 11 12 #include "src/__support/architectures.h" 13 #include "src/string/memory_utils/elements.h" 14 #include "src/string/memory_utils/utils.h" 15 16 #include <stddef.h> // size_t 17 18 namespace __llvm_libc { 19 20 // A general purpose implementation assuming cheap unaligned writes for sizes: 21 // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 22 // or 64 Bytes at a time, the compiler will expand them as needed. 23 // 24 // This implementation is subject to change as we benchmark more processors. We 25 // may also want to customize it for processors with specialized instructions 26 // that performs better (e.g. `rep stosb`). 27 // 28 // A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes. 29 // We want to balance two things here: 30 // - The number of redundant writes (when using `SetBlockOverlap`), 31 // - The number of conditionals for sizes <=128 (~90% of memset calls are for 32 // such sizes). 33 // 34 // For the range 64-128: 35 // - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this 36 // is wasteful near 65 but efficient toward 128. 37 // - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write 38 // 96 or 128 Bytes. 39 // - Another approach could be to use an hybrid approach Copy<64>+Overlap<32> 40 // for 65-96 and Copy<96>+Overlap<32> for 97-128 41 // 42 // Benchmarks showed that redundant writes were cheap (for Intel X86) but 43 // conditional were expensive, even on processor that do not support writing 64B 44 // at a time (pre-AVX512F). We also want to favor short functions that allow 45 // more hot code to fit in the iL1 cache. 46 // 47 // Above 128 we have to use conditionals since we don't know the upper bound in 48 // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> 49 // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not 50 // superior for sizes that mattered. 51 inline static void inline_memset(char *dst, unsigned char value, size_t count) { 52 #if defined(LLVM_LIBC_ARCH_X86) 53 ///////////////////////////////////////////////////////////////////////////// 54 // LLVM_LIBC_ARCH_X86 55 ///////////////////////////////////////////////////////////////////////////// 56 using namespace __llvm_libc::x86; 57 if (count == 0) 58 return; 59 if (count == 1) 60 return SplatSet<_1>(dst, value); 61 if (count == 2) 62 return SplatSet<_2>(dst, value); 63 if (count == 3) 64 return SplatSet<_3>(dst, value); 65 if (count <= 8) 66 return SplatSet<HeadTail<_4>>(dst, value, count); 67 if (count <= 16) 68 return SplatSet<HeadTail<_8>>(dst, value, count); 69 if (count <= 32) 70 return SplatSet<HeadTail<_16>>(dst, value, count); 71 if (count <= 64) 72 return SplatSet<HeadTail<_32>>(dst, value, count); 73 if (count <= 128) 74 return SplatSet<HeadTail<_64>>(dst, value, count); 75 return SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count); 76 #elif defined(LLVM_LIBC_ARCH_AARCH64) 77 ///////////////////////////////////////////////////////////////////////////// 78 // LLVM_LIBC_ARCH_AARCH64 79 ///////////////////////////////////////////////////////////////////////////// 80 using namespace __llvm_libc::aarch64_memset; 81 if (count == 0) 82 return; 83 if (count <= 3) { 84 SplatSet<_1>(dst, value); 85 if (count > 1) 86 SplatSet<Tail<_2>>(dst, value, count); 87 return; 88 } 89 if (count <= 8) 90 return SplatSet<HeadTail<_4>>(dst, value, count); 91 if (count <= 16) 92 return SplatSet<HeadTail<_8>>(dst, value, count); 93 if (count <= 32) 94 return SplatSet<HeadTail<_16>>(dst, value, count); 95 if (count <= 96) { 96 SplatSet<_32>(dst, value); 97 if (count <= 64) 98 return SplatSet<Tail<_32>>(dst, value, count); 99 SplatSet<Skip<32>::Then<_32>>(dst, value); 100 SplatSet<Tail<_32>>(dst, value, count); 101 return; 102 } 103 if (count < 448 || value != 0 || !AArch64ZVA(dst, count)) 104 return SplatSet<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count); 105 #else 106 ///////////////////////////////////////////////////////////////////////////// 107 // Default 108 ///////////////////////////////////////////////////////////////////////////// 109 using namespace ::__llvm_libc::scalar; 110 111 if (count == 0) 112 return; 113 if (count == 1) 114 return SplatSet<_1>(dst, value); 115 if (count == 2) 116 return SplatSet<_2>(dst, value); 117 if (count == 3) 118 return SplatSet<_3>(dst, value); 119 if (count <= 8) 120 return SplatSet<HeadTail<_4>>(dst, value, count); 121 if (count <= 16) 122 return SplatSet<HeadTail<_8>>(dst, value, count); 123 if (count <= 32) 124 return SplatSet<HeadTail<_16>>(dst, value, count); 125 if (count <= 64) 126 return SplatSet<HeadTail<_32>>(dst, value, count); 127 if (count <= 128) 128 return SplatSet<HeadTail<_64>>(dst, value, count); 129 return SplatSet<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count); 130 #endif 131 } 132 133 } // namespace __llvm_libc 134 135 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H 136