1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X32
3; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
4; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX1
5; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX1
6; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX512
7; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX512
8
; test1: put %f into lane 0 of a <4 x float> (remaining lanes explicitly zero),
; run it through the scalar SSE sub/mul/min/max intrinsics with constant-vector
; operands, then cvttss2si and truncate to i16. The checks verify that the
; constant operands are folded as memory operands into the scalar instructions
; (LCPI references on x86-32, RIP-relative loads on x86-64).
define i16 @test1(float %f) nounwind {
; X32-LABEL: test1:
; X32:       ## BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    addss LCPI0_0, %xmm0
; X32-NEXT:    mulss LCPI0_1, %xmm0
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT:    minss LCPI0_2, %xmm0
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT:    retl
;
; X64-LABEL: test1:
; X64:       ## BB#0:
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    mulss {{.*}}(%rip), %xmm0
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    minss {{.*}}(%rip), %xmm0
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT:    retq
;
; X32_AVX1-LABEL: test1:
; X32_AVX1:       ## BB#0:
; X32_AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX1-NEXT:    vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX1-NEXT:    vmulss LCPI0_1, %xmm0, %xmm0
; X32_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX1-NEXT:    vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX1-NEXT:    vcvttss2si %xmm0, %eax
; X32_AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32_AVX1-NEXT:    retl
;
; X64_AVX1-LABEL: test1:
; X64_AVX1:       ## BB#0:
; X64_AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX1-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX1-NEXT:    vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX1-NEXT:    vcvttss2si %xmm0, %eax
; X64_AVX1-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64_AVX1-NEXT:    retq
;
; X32_AVX512-LABEL: test1:
; X32_AVX512:       ## BB#0:
; X32_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX512-NEXT:    vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX512-NEXT:    vmulss LCPI0_1, %xmm0, %xmm0
; X32_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX512-NEXT:    vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX512-NEXT:    vcvttss2si %xmm0, %eax
; X32_AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32_AVX512-NEXT:    retl
;
; X64_AVX512-LABEL: test1:
; X64_AVX512:       ## BB#0:
; X64_AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX512-NEXT:    vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX512-NEXT:    vcvttss2si %xmm0, %eax
; X64_AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64_AVX512-NEXT:    retq
; Build the input vector: %f in lane 0, lanes 1-3 zeroed element by element.
  %tmp = insertelement <4 x float> undef, float %f, i32 0		; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1		; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2		; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3		; <<4 x float>> [#uses=1]
; Scalar-intrinsic chain: (%f - 1.0) * 0.5, clamped to [0.0, 65535.0].
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )		; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )		; <<4 x float>> [#uses=1]
; Truncating float->int convert of lane 0, then narrow the i32 to i16.
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )		; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16		; <i16> [#uses=1]
  ret i16 %tmp69
}
96
; test2: same clamp-and-truncate computation as test1, but the sub/mul are
; plain scalar `fsub`/`fmul` and the min/max intrinsic operands leave the
; upper three lanes undef. The checks verify the constant operands still fold
; into the scalar instructions without the blend needed in test1.
define i16 @test2(float %f) nounwind {
; X32-LABEL: test2:
; X32:       ## BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    addss LCPI1_0, %xmm0
; X32-NEXT:    mulss LCPI1_1, %xmm0
; X32-NEXT:    minss LCPI1_2, %xmm0
; X32-NEXT:    xorps %xmm1, %xmm1
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT:    retl
;
; X64-LABEL: test2:
; X64:       ## BB#0:
; X64-NEXT:    addss {{.*}}(%rip), %xmm0
; X64-NEXT:    mulss {{.*}}(%rip), %xmm0
; X64-NEXT:    minss {{.*}}(%rip), %xmm0
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test2:
; X32_AVX:       ## BB#0:
; X32_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT:    vaddss LCPI1_0, %xmm0, %xmm0
; X32_AVX-NEXT:    vmulss LCPI1_1, %xmm0, %xmm0
; X32_AVX-NEXT:    vminss LCPI1_2, %xmm0, %xmm0
; X32_AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT:    vcvttss2si %xmm0, %eax
; X32_AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test2:
; X64_AVX:       ## BB#0:
; X64_AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT:    vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT:    vcvttss2si %xmm0, %eax
; X64_AVX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64_AVX-NEXT:    retq
; Scalar arithmetic, then the result is inserted into lane 0 for the
; intrinsic min/max clamp; upper lanes of the constants are undef.
  %tmp28 = fsub float %f, 1.000000e+00		; <float> [#uses=1]
  %tmp37 = fmul float %tmp28, 5.000000e-01		; <float> [#uses=1]
  %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0		; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > )		; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > )		; <<4 x float>> [#uses=1]
; Convert lane 0 with truncation and narrow to the i16 return value.
  %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )		; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp to i16		; <i16> [#uses=1]
  ret i16 %tmp69
}
152
153declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)
154
155declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)
156
157declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)
158
159declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)
160
161declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)
162
163declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32)
164
165declare <4 x float> @f()
166
; test3: a float loaded from %b and inserted into lane 0 is the second operand
; of round.ss (imm 4). The checks verify the load is folded directly into the
; roundss/vroundss memory operand instead of a separate movss. %C is unused;
; it is kept only to preserve the function signature.
define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-LABEL: test3:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    roundss $4, (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test3:
; X64:       ## BB#0:
; X64-NEXT:    roundss $4, (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test3:
; X32_AVX:       ## BB#0:
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vroundss $4, (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test3:
; X64_AVX:       ## BB#0:
; X64_AVX-NEXT:    vroundss $4, (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
  %a = load float , float *%b
  %B = insertelement <4 x float> undef, float %a, i32 0
  %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
  ret <4 x float> %X
}
194
; test4: like test3, but the loaded float is live across a call to @f(), so it
; must be spilled around the call. The checks verify that roundss then folds
; the reload straight from the spill slot ("16-byte Folded Reload") rather
; than reloading into a register first. %C is again unused, signature only.
define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-LABEL: test4:
; X32:       ## BB#0:
; X32-NEXT:    subl $28, %esp
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movaps %xmm0, (%esp) ## 16-byte Spill
; X32-NEXT:    calll _f
; X32-NEXT:    roundss $4, (%esp), %xmm0 ## 16-byte Folded Reload
; X32-NEXT:    addl $28, %esp
; X32-NEXT:    retl
;
; X64-LABEL: test4:
; X64:       ## BB#0:
; X64-NEXT:    subq $24, %rsp
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    movaps %xmm0, (%rsp) ## 16-byte Spill
; X64-NEXT:    callq _f
; X64-NEXT:    roundss $4, (%rsp), %xmm0 ## 16-byte Folded Reload
; X64-NEXT:    addq $24, %rsp
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test4:
; X32_AVX:       ## BB#0:
; X32_AVX-NEXT:    subl $28, %esp
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT:    vmovaps %xmm0, (%esp) ## 16-byte Spill
; X32_AVX-NEXT:    calll _f
; X32_AVX-NEXT:    vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X32_AVX-NEXT:    addl $28, %esp
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test4:
; X64_AVX:       ## BB#0:
; X64_AVX-NEXT:    subq $24, %rsp
; X64_AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64_AVX-NEXT:    vmovaps %xmm0, (%rsp) ## 16-byte Spill
; X64_AVX-NEXT:    callq _f
; X64_AVX-NEXT:    vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X64_AVX-NEXT:    addq $24, %rsp
; X64_AVX-NEXT:    retq
  %a = load float , float *%b
  %B = insertelement <4 x float> undef, float %a, i32 0
; The call clobbers XMM registers, forcing %B to be spilled and reloaded.
  %q = call <4 x float> @f()
  %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %q, <4 x float> %B, i32 4)
  ret <4 x float> %X
}
243
244; PR13576
; test5 (PR13576): cvtsi2sd intrinsic applied to a constant <2 x double> and
; the immediate 128. The checks verify the constant vector is materialized
; (movaps from the constant pool) and the scalar convert merges the i32 into
; lane 0 via cvtsi2sdl, instead of the whole operation being constant-folded
; incorrectly.
define  <2 x double> @test5() nounwind uwtable readnone noinline {
; X32-LABEL: test5:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X32-NEXT:    movl $128, %eax
; X32-NEXT:    cvtsi2sdl %eax, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test5:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X64-NEXT:    movl $128, %eax
; X64-NEXT:    cvtsi2sdl %eax, %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: test5:
; X32_AVX:       ## BB#0: ## %entry
; X32_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X32_AVX-NEXT:    movl $128, %eax
; X32_AVX-NEXT:    vcvtsi2sdl %eax, %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: test5:
; X64_AVX:       ## BB#0: ## %entry
; X64_AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X64_AVX-NEXT:    movl $128, %eax
; X64_AVX-NEXT:    vcvtsi2sdl %eax, %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
  ret <2 x double> %0
}
277
278declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
279
; minss_fold: the scalar loaded from %x (note: align 1, i.e. possibly
; unaligned) is built into a vector used only as min.ss's second operand.
; The checks verify the load is folded into the minss/vminss memory operand.
define <4 x float> @minss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: minss_fold:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    minss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: minss_fold:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    minss (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: minss_fold:
; X32_AVX:       ## BB#0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vminss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: minss_fold:
; X64_AVX:       ## BB#0: ## %entry
; X64_AVX-NEXT:    vminss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
; Only lane 0 of the built vector matters to min.ss; lanes 1-3 are zero.
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}
311
; maxss_fold: same pattern as minss_fold but for max.ss — the (align 1)
; load feeding the second operand must fold into the maxss memory operand.
define <4 x float> @maxss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: maxss_fold:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    maxss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: maxss_fold:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    maxss (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: maxss_fold:
; X32_AVX:       ## BB#0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vmaxss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: maxss_fold:
; X64_AVX:       ## BB#0: ## %entry
; X64_AVX-NEXT:    vmaxss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
; Only lane 0 of the built vector matters to max.ss; lanes 1-3 are zero.
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}
343
; cmpss_fold: same load-folding pattern for cmp.ss with predicate 0 (EQ);
; the checks expect a single cmpeqss/vcmpeqss with a memory operand.
define <4 x float> @cmpss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: cmpss_fold:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    cmpeqss (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: cmpss_fold:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    cmpeqss (%rdi), %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: cmpss_fold:
; X32_AVX:       ## BB#0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vcmpeqss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: cmpss_fold:
; X64_AVX:       ## BB#0: ## %entry
; X64_AVX-NEXT:    vcmpeqss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
; Only lane 0 of the built vector matters to cmp.ss; lanes 1-3 are zero.
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %y, <4 x float> %vecinit4.i, i8 0)
  ret <4 x float> %1
}
375declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
376
377
; double_fold: the loaded scalar feeds BOTH min.ss and max.ss. With two uses
; the load must NOT be folded into either instruction; the checks expect a
; single movss into a register that both minss and maxss then read.
define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X32-LABEL: double_fold:
; X32:       ## BB#0: ## %entry
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movaps %xmm0, %xmm2
; X32-NEXT:    minss %xmm1, %xmm2
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    addps %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: double_fold:
; X64:       ## BB#0: ## %entry
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movaps %xmm0, %xmm2
; X64-NEXT:    minss %xmm1, %xmm2
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    addps %xmm2, %xmm0
; X64-NEXT:    retq
;
; X32_AVX-LABEL: double_fold:
; X32_AVX:       ## BB#0: ## %entry
; X32_AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX-NEXT:    vminss %xmm1, %xmm0, %xmm2
; X32_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; X32_AVX-NEXT:    retl
;
; X64_AVX-LABEL: double_fold:
; X64_AVX:       ## BB#0: ## %entry
; X64_AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64_AVX-NEXT:    vminss %xmm1, %xmm0, %xmm2
; X64_AVX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT:    vaddps %xmm0, %xmm2, %xmm0
; X64_AVX-NEXT:    retq
entry:
  %0 = load float, float* %x, align 1
; %vecinit.i has two intrinsic uses below, which blocks load folding.
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit.i)
  %2 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit.i)
  %3 = fadd <4 x float> %1, %2
  ret <4 x float> %3
}
422