; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse  | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx  | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

; https://llvm.org/bugs/show_bug.cgi?id=27100
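;
; The report above concerns lowering a memset of a nonzero constant byte to
; wide inline stores (a splatted GPR, or unaligned XMM/YMM/ZMM stores of a
; splatted vector) instead of always calling the memset libcall. The checks
; below encode that expectation; note that the plain-SSE 256-byte case still
; falls back to the libcall.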

define void @memset_16_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1)
  ret void
}

define void @memset_32_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 32, i64 -1)
  ret void
}

define void @memset_64_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_64_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_64_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 64, i64 -1)
  ret void
}

define void @memset_128_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 120(%rdi)
; SSE-NEXT:    movq %rax, 112(%rdi)
; SSE-NEXT:    movq %rax, 104(%rdi)
; SSE-NEXT:    movq %rax, 96(%rdi)
; SSE-NEXT:    movq %rax, 88(%rdi)
; SSE-NEXT:    movq %rax, 80(%rdi)
; SSE-NEXT:    movq %rax, 72(%rdi)
; SSE-NEXT:    movq %rax, 64(%rdi)
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_128_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_128_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 128, i64 -1)
  ret void
}

define void @memset_256_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    movl $42, %esi
; SSE-NEXT:    jmp memset@PLT # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_256_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, 192(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, 128(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_256_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovaps {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, 192(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, 128(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 256, i64 -1)
  ret void
}

declare ptr @__memset_chk(ptr, i32, i64, i64)
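; __memset_chk is the fortified form of memset; the object-size argument in the
; calls above is -1 (unknown), so the bounds check is trivially satisfied and
; the call is presumably simplified to an ordinary memset of a constant length
; before lowering.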

; Repeat with a non-constant value for the stores.
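; With a variable fill byte, the byte must first be splatted: the scalar SSE
; path multiplies the zero-extended byte by 0x0101010101010101 to build a
; 64-bit splat in a GPR, while the vector paths broadcast it into an XMM, YMM,
; or ZMM register (punpcklbw/pshufd, pshufb, vpbroadcastb, or an AVX-512 GPR
; broadcast) before issuing the same sequence of unaligned stores as above.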

define void @memset_16_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: memset_16_nonconst_bytes:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX512-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 16, i1 false)
  ret void
}

define void @memset_32_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: memset_32_nonconst_bytes:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 32, i1 false)
  ret void
}

define void @memset_64_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_64_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_64_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 64, i1 false)
  ret void
}

define void @memset_128_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 120(%rdi)
; SSE-NEXT:    movq %rcx, 112(%rdi)
; SSE-NEXT:    movq %rcx, 104(%rdi)
; SSE-NEXT:    movq %rcx, 96(%rdi)
; SSE-NEXT:    movq %rcx, 88(%rdi)
; SSE-NEXT:    movq %rcx, 80(%rdi)
; SSE-NEXT:    movq %rcx, 72(%rdi)
; SSE-NEXT:    movq %rcx, 64(%rdi)
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_128_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_128_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 128, i1 false)
  ret void
}

define void @memset_256_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    jmp memset@PLT # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_256_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_256_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 256, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) #1