1 //===-- Implementation of memset and bzero --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
10 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
11
12 #include "src/__support/architectures.h"
13 #include "src/string/memory_utils/elements.h"
14 #include "src/string/memory_utils/utils.h"
15
16 #include <stddef.h> // size_t
17
18 namespace __llvm_libc {
19
// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
// or 64 Bytes at a time; the compiler will expand such stores as needed.
23 //
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that perform better (e.g. `rep stosb`).
27 //
28 // A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
29 // We want to balance two things here:
30 // - The number of redundant writes (when using `SetBlockOverlap`),
31 // - The number of conditionals for sizes <=128 (~90% of memset calls are for
32 // such sizes).
33 //
// For the range 65-128:
//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes;
//    this is wasteful near 65 but efficient toward 128.
//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
//    96 or 128 Bytes.
//  - Another approach could be a hybrid one: copy<64>+Overlap<32> for 65-96
//    and copy<96>+Overlap<32> for 97-128.
41 //
// Benchmarks showed that redundant writes were cheap (for Intel X86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that
// allow more hot code to fit in the iL1 cache.
46 //
47 // Above 128 we have to use conditionals since we don't know the upper bound in
48 // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
49 // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
50 // superior for sizes that mattered.
inline_memset(char * dst,unsigned char value,size_t count)51 inline static void inline_memset(char *dst, unsigned char value, size_t count) {
52 #if defined(LLVM_LIBC_ARCH_X86)
53 /////////////////////////////////////////////////////////////////////////////
54 // LLVM_LIBC_ARCH_X86
55 /////////////////////////////////////////////////////////////////////////////
56 using namespace __llvm_libc::x86;
57 if (count == 0)
58 return;
59 if (count == 1)
60 return splat_set<_1>(dst, value);
61 if (count == 2)
62 return splat_set<_2>(dst, value);
63 if (count == 3)
64 return splat_set<_3>(dst, value);
65 if (count <= 8)
66 return splat_set<HeadTail<_4>>(dst, value, count);
67 if (count <= 16)
68 return splat_set<HeadTail<_8>>(dst, value, count);
69 if (count <= 32)
70 return splat_set<HeadTail<_16>>(dst, value, count);
71 if (count <= 64)
72 return splat_set<HeadTail<_32>>(dst, value, count);
73 if (count <= 128)
74 return splat_set<HeadTail<_64>>(dst, value, count);
75 return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
76 #elif defined(LLVM_LIBC_ARCH_AARCH64)
77 /////////////////////////////////////////////////////////////////////////////
78 // LLVM_LIBC_ARCH_AARCH64
79 /////////////////////////////////////////////////////////////////////////////
80 using namespace __llvm_libc::aarch64_memset;
81 if (count == 0)
82 return;
83 if (count <= 3) {
84 splat_set<_1>(dst, value);
85 if (count > 1)
86 splat_set<Tail<_2>>(dst, value, count);
87 return;
88 }
89 if (count <= 8)
90 return splat_set<HeadTail<_4>>(dst, value, count);
91 if (count <= 16)
92 return splat_set<HeadTail<_8>>(dst, value, count);
93 if (count <= 32)
94 return splat_set<HeadTail<_16>>(dst, value, count);
95 if (count <= 96) {
96 splat_set<_32>(dst, value);
97 if (count <= 64)
98 return splat_set<Tail<_32>>(dst, value, count);
99 splat_set<Skip<32>::Then<_32>>(dst, value);
100 splat_set<Tail<_32>>(dst, value, count);
101 return;
102 }
103 if (count < 448 || value != 0 || !AArch64ZVA(dst, count))
104 return splat_set<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
105 #else
106 /////////////////////////////////////////////////////////////////////////////
107 // Default
108 /////////////////////////////////////////////////////////////////////////////
109 using namespace ::__llvm_libc::scalar;
110
111 if (count == 0)
112 return;
113 if (count == 1)
114 return splat_set<_1>(dst, value);
115 if (count == 2)
116 return splat_set<_2>(dst, value);
117 if (count == 3)
118 return splat_set<_3>(dst, value);
119 if (count <= 8)
120 return splat_set<HeadTail<_4>>(dst, value, count);
121 if (count <= 16)
122 return splat_set<HeadTail<_8>>(dst, value, count);
123 if (count <= 32)
124 return splat_set<HeadTail<_16>>(dst, value, count);
125 if (count <= 64)
126 return splat_set<HeadTail<_32>>(dst, value, count);
127 if (count <= 128)
128 return splat_set<HeadTail<_64>>(dst, value, count);
129 return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
130 #endif
131 }
132
133 } // namespace __llvm_libc
134
135 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
136