//===-- Implementation of memset and bzero --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
8c02aa154SGuillaume Chatelet
9c02aa154SGuillaume Chatelet #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
10c02aa154SGuillaume Chatelet #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
11c02aa154SGuillaume Chatelet
12c02aa154SGuillaume Chatelet #include "src/__support/architectures.h"
13c02aa154SGuillaume Chatelet #include "src/string/memory_utils/elements.h"
14c02aa154SGuillaume Chatelet #include "src/string/memory_utils/utils.h"
15c02aa154SGuillaume Chatelet
16c02aa154SGuillaume Chatelet #include <stddef.h> // size_t
17c02aa154SGuillaume Chatelet
18c02aa154SGuillaume Chatelet namespace __llvm_libc {
19c02aa154SGuillaume Chatelet
20c02aa154SGuillaume Chatelet // A general purpose implementation assuming cheap unaligned writes for sizes:
21c02aa154SGuillaume Chatelet // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
22c02aa154SGuillaume Chatelet // or 64 Bytes at a time, the compiler will expand them as needed.
23c02aa154SGuillaume Chatelet //
24c02aa154SGuillaume Chatelet // This implementation is subject to change as we benchmark more processors. We
25c02aa154SGuillaume Chatelet // may also want to customize it for processors with specialized instructions
26c02aa154SGuillaume Chatelet // that performs better (e.g. `rep stosb`).
27c02aa154SGuillaume Chatelet //
28c02aa154SGuillaume Chatelet // A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
29c02aa154SGuillaume Chatelet // We want to balance two things here:
30c02aa154SGuillaume Chatelet // - The number of redundant writes (when using `SetBlockOverlap`),
31c02aa154SGuillaume Chatelet // - The number of conditionals for sizes <=128 (~90% of memset calls are for
32c02aa154SGuillaume Chatelet // such sizes).
33c02aa154SGuillaume Chatelet //
34c02aa154SGuillaume Chatelet // For the range 64-128:
35c02aa154SGuillaume Chatelet // - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
36c02aa154SGuillaume Chatelet // is wasteful near 65 but efficient toward 128.
37c02aa154SGuillaume Chatelet // - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
38c02aa154SGuillaume Chatelet // 96 or 128 Bytes.
39*1c92911eSMichael Jones // - Another approach could be to use an hybrid approach copy<64>+Overlap<32>
40*1c92911eSMichael Jones // for 65-96 and copy<96>+Overlap<32> for 97-128
41c02aa154SGuillaume Chatelet //
42c02aa154SGuillaume Chatelet // Benchmarks showed that redundant writes were cheap (for Intel X86) but
43c02aa154SGuillaume Chatelet // conditional were expensive, even on processor that do not support writing 64B
44c02aa154SGuillaume Chatelet // at a time (pre-AVX512F). We also want to favor short functions that allow
45c02aa154SGuillaume Chatelet // more hot code to fit in the iL1 cache.
46c02aa154SGuillaume Chatelet //
47c02aa154SGuillaume Chatelet // Above 128 we have to use conditionals since we don't know the upper bound in
48c02aa154SGuillaume Chatelet // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
49c02aa154SGuillaume Chatelet // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
50c02aa154SGuillaume Chatelet // superior for sizes that mattered.
inline_memset(char * dst,unsigned char value,size_t count)51c02aa154SGuillaume Chatelet inline static void inline_memset(char *dst, unsigned char value, size_t count) {
52c02aa154SGuillaume Chatelet #if defined(LLVM_LIBC_ARCH_X86)
53c02aa154SGuillaume Chatelet /////////////////////////////////////////////////////////////////////////////
54c02aa154SGuillaume Chatelet // LLVM_LIBC_ARCH_X86
55c02aa154SGuillaume Chatelet /////////////////////////////////////////////////////////////////////////////
56c02aa154SGuillaume Chatelet using namespace __llvm_libc::x86;
57c02aa154SGuillaume Chatelet if (count == 0)
58c02aa154SGuillaume Chatelet return;
59c02aa154SGuillaume Chatelet if (count == 1)
60*1c92911eSMichael Jones return splat_set<_1>(dst, value);
61c02aa154SGuillaume Chatelet if (count == 2)
62*1c92911eSMichael Jones return splat_set<_2>(dst, value);
63c02aa154SGuillaume Chatelet if (count == 3)
64*1c92911eSMichael Jones return splat_set<_3>(dst, value);
65c02aa154SGuillaume Chatelet if (count <= 8)
66*1c92911eSMichael Jones return splat_set<HeadTail<_4>>(dst, value, count);
67c02aa154SGuillaume Chatelet if (count <= 16)
68*1c92911eSMichael Jones return splat_set<HeadTail<_8>>(dst, value, count);
69c02aa154SGuillaume Chatelet if (count <= 32)
70*1c92911eSMichael Jones return splat_set<HeadTail<_16>>(dst, value, count);
71c02aa154SGuillaume Chatelet if (count <= 64)
72*1c92911eSMichael Jones return splat_set<HeadTail<_32>>(dst, value, count);
73c02aa154SGuillaume Chatelet if (count <= 128)
74*1c92911eSMichael Jones return splat_set<HeadTail<_64>>(dst, value, count);
75*1c92911eSMichael Jones return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
76c02aa154SGuillaume Chatelet #elif defined(LLVM_LIBC_ARCH_AARCH64)
77c02aa154SGuillaume Chatelet /////////////////////////////////////////////////////////////////////////////
78c02aa154SGuillaume Chatelet // LLVM_LIBC_ARCH_AARCH64
79c02aa154SGuillaume Chatelet /////////////////////////////////////////////////////////////////////////////
80c02aa154SGuillaume Chatelet using namespace __llvm_libc::aarch64_memset;
81c02aa154SGuillaume Chatelet if (count == 0)
82c02aa154SGuillaume Chatelet return;
83c02aa154SGuillaume Chatelet if (count <= 3) {
84*1c92911eSMichael Jones splat_set<_1>(dst, value);
85c02aa154SGuillaume Chatelet if (count > 1)
86*1c92911eSMichael Jones splat_set<Tail<_2>>(dst, value, count);
87c02aa154SGuillaume Chatelet return;
88c02aa154SGuillaume Chatelet }
89c02aa154SGuillaume Chatelet if (count <= 8)
90*1c92911eSMichael Jones return splat_set<HeadTail<_4>>(dst, value, count);
91c02aa154SGuillaume Chatelet if (count <= 16)
92*1c92911eSMichael Jones return splat_set<HeadTail<_8>>(dst, value, count);
93c02aa154SGuillaume Chatelet if (count <= 32)
94*1c92911eSMichael Jones return splat_set<HeadTail<_16>>(dst, value, count);
95c02aa154SGuillaume Chatelet if (count <= 96) {
96*1c92911eSMichael Jones splat_set<_32>(dst, value);
97c02aa154SGuillaume Chatelet if (count <= 64)
98*1c92911eSMichael Jones return splat_set<Tail<_32>>(dst, value, count);
99*1c92911eSMichael Jones splat_set<Skip<32>::Then<_32>>(dst, value);
100*1c92911eSMichael Jones splat_set<Tail<_32>>(dst, value, count);
101c02aa154SGuillaume Chatelet return;
102c02aa154SGuillaume Chatelet }
103c02aa154SGuillaume Chatelet if (count < 448 || value != 0 || !AArch64ZVA(dst, count))
104*1c92911eSMichael Jones return splat_set<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
105c02aa154SGuillaume Chatelet #else
106c02aa154SGuillaume Chatelet /////////////////////////////////////////////////////////////////////////////
107c02aa154SGuillaume Chatelet // Default
108c02aa154SGuillaume Chatelet /////////////////////////////////////////////////////////////////////////////
109c02aa154SGuillaume Chatelet using namespace ::__llvm_libc::scalar;
110c02aa154SGuillaume Chatelet
111c02aa154SGuillaume Chatelet if (count == 0)
112c02aa154SGuillaume Chatelet return;
113c02aa154SGuillaume Chatelet if (count == 1)
114*1c92911eSMichael Jones return splat_set<_1>(dst, value);
115c02aa154SGuillaume Chatelet if (count == 2)
116*1c92911eSMichael Jones return splat_set<_2>(dst, value);
117c02aa154SGuillaume Chatelet if (count == 3)
118*1c92911eSMichael Jones return splat_set<_3>(dst, value);
119c02aa154SGuillaume Chatelet if (count <= 8)
120*1c92911eSMichael Jones return splat_set<HeadTail<_4>>(dst, value, count);
121c02aa154SGuillaume Chatelet if (count <= 16)
122*1c92911eSMichael Jones return splat_set<HeadTail<_8>>(dst, value, count);
123c02aa154SGuillaume Chatelet if (count <= 32)
124*1c92911eSMichael Jones return splat_set<HeadTail<_16>>(dst, value, count);
125c02aa154SGuillaume Chatelet if (count <= 64)
126*1c92911eSMichael Jones return splat_set<HeadTail<_32>>(dst, value, count);
127c02aa154SGuillaume Chatelet if (count <= 128)
128*1c92911eSMichael Jones return splat_set<HeadTail<_64>>(dst, value, count);
129*1c92911eSMichael Jones return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
130c02aa154SGuillaume Chatelet #endif
131c02aa154SGuillaume Chatelet }
132c02aa154SGuillaume Chatelet
133c02aa154SGuillaume Chatelet } // namespace __llvm_libc
134c02aa154SGuillaume Chatelet
135c02aa154SGuillaume Chatelet #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
136