; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64

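; Note that the numeric suffix of the check prefixes does not follow target
; bitness here: the *32 prefixes (ALL32, SSE32, ...) are applied to the x86_64
; RUN lines above, and the *64 prefixes to the i686 RUN lines.
;
; If codegen changes, the CHECK lines below can be regenerated by running
; utils/update_llc_test_checks.py on this file (see the script's --help for
; how to point it at a locally built llc).

; Scalar stores first: unaligned i32/i16 stores should be selected to plain
; movl/movw, and the stored value is also returned.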
define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32:       # BB#0: # %entry
; ALL32-NEXT:    movl %esi, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_32:
; ALL64:       # BB#0: # %entry
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movl %eax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32:       # BB#0: # %entry
; ALL32-NEXT:    movw %si, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_16:
; ALL64:       # BB#0: # %entry
; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movw %ax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

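; 128-bit vector stores. The extra add/fadd forces the expected register
; domain; unaligned (align 1) stores should select movdqu/movups/movupd and
; 16-byte-aligned stores movdqa/movaps/movapd (v-prefixed under AVX).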
define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32:       # BB#0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqu %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqu %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_4xi32:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXONLY32-NEXT:    vmovdqu %xmm0, (%rdi)
; AVXONLY32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
; AVX64-NEXT:    retl
;
; KNL32-LABEL: test_store_4xi32:
; KNL32:       # BB#0:
; KNL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; KNL32-NEXT:    vmovdqu %xmm0, (%rdi)
; KNL32-NEXT:    retq
;
; SKX32-LABEL: test_store_4xi32:
; SKX32:       # BB#0:
; SKX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; SKX32-NEXT:    vmovdqu %xmm0, (%rdi)
; SKX32-NEXT:    retq
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqa %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqa %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_4xi32_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXONLY32-NEXT:    vmovdqa %xmm0, (%rdi)
; AVXONLY32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
; AVX64-NEXT:    retl
;
; KNL32-LABEL: test_store_4xi32_aligned:
; KNL32:       # BB#0:
; KNL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; KNL32-NEXT:    vmovdqa %xmm0, (%rdi)
; KNL32-NEXT:    retq
;
; SKX32-LABEL: test_store_4xi32_aligned:
; SKX32:       # BB#0:
; SKX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; SKX32-NEXT:    vmovdqa %xmm0, (%rdi)
; SKX32-NEXT:    retq
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovups %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovupd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovapd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

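; 256-bit vector stores: SSE targets split these into two 16-byte operations,
; while AVX targets use a single 32-byte ymm store. As above, align 1 selects
; the unaligned (movups-style) forms and align 32 the aligned forms.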
define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi0:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovupd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi1:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovapd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

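; 512-bit vector stores: SSE targets use four 16-byte operations, AVX targets
; two 32-byte ymm stores, and AVX512 a single 64-byte zmm store. On i686 part
; of the value is passed on the stack, hence the stack-pointer adjustments and
; CFI directives in the corresponding checks.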
define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi2:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi3:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi4:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi5:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi6:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    movupd %xmm2, 32(%eax)
; SSE64-NEXT:    movupd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:  .Lcfi0:
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:  .Lcfi1:
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:  .Lcfi2:
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi7:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    movapd %xmm2, 32(%eax)
; SSE64-NEXT:    movapd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:  .Lcfi3:
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:  .Lcfi4:
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:  .Lcfi5:
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}