; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32-AVX
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64-AVX

; Truncate a <2 x double> loaded from memory and store the <2 x float>
; result unaligned (align 1).
define void @fptrunc_frommem2(<2 x double>* %in, <2 x float>* %out) {
; X32-SSE-LABEL: fptrunc_frommem2:
; X32-SSE:       # BB#0: # %entry
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT:    cvtpd2ps (%ecx), %xmm0
; X32-SSE-NEXT:    extractps $1, %xmm0, 4(%eax)
; X32-SSE-NEXT:    movss %xmm0, (%eax)
; X32-SSE-NEXT:    retl
;
; X32-AVX-LABEL: fptrunc_frommem2:
; X32-AVX:       # BB#0: # %entry
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vcvtpd2psx (%ecx), %xmm0
; X32-AVX-NEXT:    vextractps $1, %xmm0, 4(%eax)
; X32-AVX-NEXT:    vmovss %xmm0, (%eax)
; X32-AVX-NEXT:    retl
;
; X64-SSE-LABEL: fptrunc_frommem2:
; X64-SSE:       # BB#0: # %entry
; X64-SSE-NEXT:    cvtpd2ps (%rdi), %xmm0
; X64-SSE-NEXT:    movlpd %xmm0, (%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: fptrunc_frommem2:
; X64-AVX:       # BB#0: # %entry
; X64-AVX-NEXT:    vcvtpd2psx (%rdi), %xmm0
; X64-AVX-NEXT:    vmovlpd %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
entry:
  %0 = load <2 x double>, <2 x double>* %in
  %1 = fptrunc <2 x double> %0 to <2 x float>
  store <2 x float> %1, <2 x float>* %out, align 1
  ret void
}

; Truncate a <4 x double> loaded from memory and store the <4 x float>
; result unaligned (align 1).
define void @fptrunc_frommem4(<4 x double>* %in, <4 x float>* %out) {
; X32-SSE-LABEL: fptrunc_frommem4:
; X32-SSE:       # BB#0: # %entry
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT:    cvtpd2ps 16(%ecx), %xmm0
; X32-SSE-NEXT:    cvtpd2ps (%ecx), %xmm1
; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-SSE-NEXT:    movupd %xmm1, (%eax)
; X32-SSE-NEXT:    retl
;
; X32-AVX-LABEL: fptrunc_frommem4:
; X32-AVX:       # BB#0: # %entry
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vcvtpd2psy (%ecx), %xmm0
; X32-AVX-NEXT:    vmovupd %xmm0, (%eax)
; X32-AVX-NEXT:    retl
;
; X64-SSE-LABEL: fptrunc_frommem4:
; X64-SSE:       # BB#0: # %entry
; X64-SSE-NEXT:    cvtpd2ps 16(%rdi), %xmm0
; X64-SSE-NEXT:    cvtpd2ps (%rdi), %xmm1
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    movupd %xmm1, (%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: fptrunc_frommem4:
; X64-AVX:       # BB#0: # %entry
; X64-AVX-NEXT:    vcvtpd2psy (%rdi), %xmm0
; X64-AVX-NEXT:    vmovupd %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
entry:
  %0 = load <4 x double>, <4 x double>* %in
  %1 = fptrunc <4 x double> %0 to <4 x float>
  store <4 x float> %1, <4 x float>* %out, align 1
  ret void
}

; Truncate an <8 x double> loaded from memory and store the <8 x float>
; result unaligned (align 1).
define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
; X32-SSE-LABEL: fptrunc_frommem8:
; X32-SSE:       # BB#0: # %entry
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-SSE-NEXT:    cvtpd2ps 16(%ecx), %xmm0
; X32-SSE-NEXT:    cvtpd2ps (%ecx), %xmm1
; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X32-SSE-NEXT:    cvtpd2ps 48(%ecx), %xmm0
; X32-SSE-NEXT:    cvtpd2ps 32(%ecx), %xmm2
; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X32-SSE-NEXT:    movupd %xmm2, 16(%eax)
; X32-SSE-NEXT:    movupd %xmm1, (%eax)
; X32-SSE-NEXT:    retl
;
; X32-AVX-LABEL: fptrunc_frommem8:
; X32-AVX:       # BB#0: # %entry
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vcvtpd2psy (%ecx), %xmm0
; X32-AVX-NEXT:    vcvtpd2psy 32(%ecx), %xmm1
; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovupd %ymm0, (%eax)
; X32-AVX-NEXT:    vzeroupper
; X32-AVX-NEXT:    retl
;
; X64-SSE-LABEL: fptrunc_frommem8:
; X64-SSE:       # BB#0: # %entry
; X64-SSE-NEXT:    cvtpd2ps 16(%rdi), %xmm0
; X64-SSE-NEXT:    cvtpd2ps (%rdi), %xmm1
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X64-SSE-NEXT:    cvtpd2ps 48(%rdi), %xmm0
; X64-SSE-NEXT:    cvtpd2ps 32(%rdi), %xmm2
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; X64-SSE-NEXT:    movupd %xmm2, 16(%rsi)
; X64-SSE-NEXT:    movupd %xmm1, (%rsi)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: fptrunc_frommem8:
; X64-AVX:       # BB#0: # %entry
; X64-AVX-NEXT:    vcvtpd2psy (%rdi), %xmm0
; X64-AVX-NEXT:    vcvtpd2psy 32(%rdi), %xmm1
; X64-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovupd %ymm0, (%rsi)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
entry:
  %0 = load <8 x double>, <8 x double>* %in
  %1 = fptrunc <8 x double> %0 to <8 x float>
  store <8 x float> %1, <8 x float>* %out, align 1
  ret void
}

; Truncate a loaded <2 x double> and zero-extend the result into the upper
; half of a <4 x float> (upper lanes come from a zero vector shuffle).
define <4 x float> @fptrunc_frommem2_zext(<2 x double> * %ld) {
; X32-SSE-LABEL: fptrunc_frommem2_zext:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT:    cvtpd2ps (%eax), %xmm0
; X32-SSE-NEXT:    retl
;
; X32-AVX-LABEL: fptrunc_frommem2_zext:
; X32-AVX:       # BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vcvtpd2psx (%eax), %xmm0
; X32-AVX-NEXT:    retl
;
; X64-SSE-LABEL: fptrunc_frommem2_zext:
; X64-SSE:       # BB#0:
; X64-SSE-NEXT:    cvtpd2ps (%rdi), %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: fptrunc_frommem2_zext:
; X64-AVX:       # BB#0:
; X64-AVX-NEXT:    vcvtpd2psx (%rdi), %xmm0
; X64-AVX-NEXT:    retq
  %arg = load <2 x double>, <2 x double> * %ld, align 16
  %cvt = fptrunc <2 x double> %arg to <2 x float>
  %ret = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %ret
}

; Same zext-shuffle pattern as above, but with the <2 x double> source in a
; register instead of loaded from memory.
define <4 x float> @fptrunc_fromreg2_zext(<2 x double> %arg) {
; X32-SSE-LABEL: fptrunc_fromreg2_zext:
; X32-SSE:       # BB#0:
; X32-SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; X32-SSE-NEXT:    retl
;
; X32-AVX-LABEL: fptrunc_fromreg2_zext:
; X32-AVX:       # BB#0:
; X32-AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
; X32-AVX-NEXT:    retl
;
; X64-SSE-LABEL: fptrunc_fromreg2_zext:
; X64-SSE:       # BB#0:
; X64-SSE-NEXT:    cvtpd2ps %xmm0, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: fptrunc_fromreg2_zext:
; X64-AVX:       # BB#0:
; X64-AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %cvt = fptrunc <2 x double> %arg to <2 x float>
  %ret = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
  ret <4 x float> %ret
}

; FIXME: For exact truncations we should be able to fold this.
; fptrunc of a constant <4 x double> built via insertelement; the conversions
; are currently emitted at run time instead of being constant-folded.
define <4 x float> @fptrunc_fromconst() {
; X32-SSE-LABEL: fptrunc_fromconst:
; X32-SSE:       # BB#0: # %entry
; X32-SSE-NEXT:    cvtpd2ps {{\.LCPI.*}}, %xmm1
; X32-SSE-NEXT:    cvtpd2ps {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-SSE-NEXT:    retl
;
; X32-AVX-LABEL: fptrunc_fromconst:
; X32-AVX:       # BB#0: # %entry
; X32-AVX-NEXT:    vcvtpd2psy {{\.LCPI.*}}, %xmm0
; X32-AVX-NEXT:    retl
;
; X64-SSE-LABEL: fptrunc_fromconst:
; X64-SSE:       # BB#0: # %entry
; X64-SSE-NEXT:    cvtpd2ps {{.*}}(%rip), %xmm1
; X64-SSE-NEXT:    cvtpd2ps {{.*}}(%rip), %xmm0
; X64-SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: fptrunc_fromconst:
; X64-AVX:       # BB#0: # %entry
; X64-AVX-NEXT:    vcvtpd2psy {{.*}}(%rip), %xmm0
; X64-AVX-NEXT:    retq
entry:
  %0  = insertelement <4 x double> undef, double 1.0, i32 0
  %1  = insertelement <4 x double> %0, double -2.0, i32 1
  %2  = insertelement <4 x double> %1, double +4.0, i32 2
  %3  = insertelement <4 x double> %2, double -0.0, i32 3
  %4  = fptrunc <4 x double> %3 to <4 x float>
  ret <4 x float> %4
}
