; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+sse,+sse2,+sse4.1 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX1
; RUN: llc < %s -disable-peephole -mtriple=i686-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X32_AVX --check-prefix=X32_AVX512
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin9 -mattr=+avx512f | FileCheck %s --check-prefix=X64_AVX --check-prefix=X64_AVX512

; A chain of scalar SSE intrinsics (sub/mul/min/max/cvttss2si) built from a
; scalar float widened with insertelement; checks the scalar ops are selected
; with constant-pool memory operands.
define i16 @test1(float %f) nounwind {
; X32-LABEL: test1:
; X32: ## BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: addss LCPI0_0, %xmm0
; X32-NEXT: mulss LCPI0_1, %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32-NEXT: minss LCPI0_2, %xmm0
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: test1:
; X64: ## BB#0:
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: minss {{.*}}(%rip), %xmm0
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32_AVX1-LABEL: test1:
; X32_AVX1: ## BB#0:
; X32_AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX1-NEXT: vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX1-NEXT: vmulss LCPI0_1, %xmm0, %xmm0
; X32_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX1-NEXT: vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX1-NEXT: vcvttss2si %xmm0, %eax
; X32_AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32_AVX1-NEXT: retl
;
; X64_AVX1-LABEL: test1:
; X64_AVX1: ## BB#0:
; X64_AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX1-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX1-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX1-NEXT: vcvttss2si %xmm0, %eax
; X64_AVX1-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64_AVX1-NEXT: retq
;
; X32_AVX512-LABEL: test1:
; X32_AVX512: ## BB#0:
; X32_AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX512-NEXT: vaddss LCPI0_0, %xmm0, %xmm0
; X32_AVX512-NEXT: vmulss LCPI0_1, %xmm0, %xmm0
; X32_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X32_AVX512-NEXT: vminss LCPI0_2, %xmm0, %xmm0
; X32_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX512-NEXT: vcvttss2si %xmm0, %eax
; X32_AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32_AVX512-NEXT: retl
;
; X64_AVX512-LABEL: test1:
; X64_AVX512: ## BB#0:
; X64_AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX512-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64_AVX512-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX512-NEXT: vcvttss2si %xmm0, %eax
; X64_AVX512-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64_AVX512-NEXT: retq
  %tmp = insertelement <4 x float> undef, float %f, i32 0 ; <<4 x float>> [#uses=1]
  %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 ; <<4 x float>> [#uses=1]
  %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, i32 2 ; <<4 x float>> [#uses=1]
  %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, i32 3 ; <<4 x float>> [#uses=1]
  %tmp28 = tail call <4 x float> @llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp37 = tail call <4 x float> @llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > ) ; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer ) ; <<4 x float>> [#uses=1]
  %tmp.upgrd.1 = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp.upgrd.1 to i16 ; <i16> [#uses=1]
  ret i16 %tmp69
}

; Same intrinsic chain as test1, but the sub/mul are plain scalar fsub/fmul and
; the min/max operands use undef upper lanes, so no blend of the upper elements
; is needed.
define i16 @test2(float %f) nounwind {
; X32-LABEL: test2:
; X32: ## BB#0:
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: addss LCPI1_0, %xmm0
; X32-NEXT: mulss LCPI1_1, %xmm0
; X32-NEXT: minss LCPI1_2, %xmm0
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32-NEXT: retl
;
; X64-LABEL: test2:
; X64: ## BB#0:
; X64-NEXT: addss {{.*}}(%rip), %xmm0
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
; X64-NEXT: minss {{.*}}(%rip), %xmm0
; X64-NEXT: xorps %xmm1, %xmm1
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64-NEXT: retq
;
; X32_AVX-LABEL: test2:
; X32_AVX: ## BB#0:
; X32_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT: vaddss LCPI1_0, %xmm0, %xmm0
; X32_AVX-NEXT: vmulss LCPI1_1, %xmm0, %xmm0
; X32_AVX-NEXT: vminss LCPI1_2, %xmm0, %xmm0
; X32_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT: vcvttss2si %xmm0, %eax
; X32_AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: test2:
; X64_AVX: ## BB#0:
; X64_AVX-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: vmulss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: vminss {{.*}}(%rip), %xmm0, %xmm0
; X64_AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT: vcvttss2si %xmm0, %eax
; X64_AVX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; X64_AVX-NEXT: retq
  %tmp28 = fsub float %f, 1.000000e+00 ; <float> [#uses=1]
  %tmp37 = fmul float %tmp28, 5.000000e-01 ; <float> [#uses=1]
  %tmp375 = insertelement <4 x float> undef, float %tmp37, i32 0 ; <<4 x float>> [#uses=1]
  %tmp48 = tail call <4 x float> @llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
  %tmp59 = tail call <4 x float> @llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > ) ; <<4 x float>> [#uses=1]
  %tmp = tail call i32 @llvm.x86.sse.cvttss2si( <4 x float> %tmp59 ) ; <i32> [#uses=1]
  %tmp69 = trunc i32 %tmp to i16 ; <i16> [#uses=1]
  ret i16 %tmp69
}

declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>)

declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>)

declare i32 @llvm.x86.sse.cvttss2si(<4 x float>)

declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32)

declare <4 x float> @f()

; The scalar load feeding the second operand of round.ss should be folded into
; the roundss memory operand.
define <4 x float> @test3(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-LABEL: test3:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: roundss $4, (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test3:
; X64: ## BB#0:
; X64-NEXT: roundss $4, (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: test3:
; X32_AVX: ## BB#0:
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vroundss $4, (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: test3:
; X64_AVX: ## BB#0:
; X64_AVX-NEXT: vroundss $4, (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
  %a = load float , float *%b
  %B = insertelement <4 x float> undef, float %a, i32 0
  %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %A, <4 x float> %B, i32 4)
  ret <4 x float> %X
}

; Like test3, but the loaded value lives across a call: the expected code
; spills it and folds the 16-byte reload into roundss.
define <4 x float> @test4(<4 x float> %A, float *%b, i32 %C) nounwind {
; X32-LABEL: test4:
; X32: ## BB#0:
; X32-NEXT: subl $28, %esp
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movaps %xmm0, (%esp) ## 16-byte Spill
; X32-NEXT: calll _f
; X32-NEXT: roundss $4, (%esp), %xmm0 ## 16-byte Folded Reload
; X32-NEXT: addl $28, %esp
; X32-NEXT: retl
;
; X64-LABEL: test4:
; X64: ## BB#0:
; X64-NEXT: subq $24, %rsp
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT: movaps %xmm0, (%rsp) ## 16-byte Spill
; X64-NEXT: callq _f
; X64-NEXT: roundss $4, (%rsp), %xmm0 ## 16-byte Folded Reload
; X64-NEXT: addq $24, %rsp
; X64-NEXT: retq
;
; X32_AVX-LABEL: test4:
; X32_AVX: ## BB#0:
; X32_AVX-NEXT: subl $28, %esp
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32_AVX-NEXT: vmovaps %xmm0, (%esp) ## 16-byte Spill
; X32_AVX-NEXT: calll _f
; X32_AVX-NEXT: vroundss $4, (%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X32_AVX-NEXT: addl $28, %esp
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: test4:
; X64_AVX: ## BB#0:
; X64_AVX-NEXT: subq $24, %rsp
; X64_AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64_AVX-NEXT: vmovaps %xmm0, (%rsp) ## 16-byte Spill
; X64_AVX-NEXT: callq _f
; X64_AVX-NEXT: vroundss $4, (%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
; X64_AVX-NEXT: addq $24, %rsp
; X64_AVX-NEXT: retq
  %a = load float , float *%b
  %B = insertelement <4 x float> undef, float %a, i32 0
  %q = call <4 x float> @f()
  %X = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %q, <4 x float> %B, i32 4)
  ret <4 x float> %X
}

; PR13576
define <2 x double> @test5() nounwind uwtable readnone noinline {
; X32-LABEL: test5:
; X32: ## BB#0: ## %entry
; X32-NEXT: movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X32-NEXT: movl $128, %eax
; X32-NEXT: cvtsi2sdl %eax, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: test5:
; X64: ## BB#0: ## %entry
; X64-NEXT: movaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X64-NEXT: movl $128, %eax
; X64-NEXT: cvtsi2sdl %eax, %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: test5:
; X32_AVX: ## BB#0: ## %entry
; X32_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X32_AVX-NEXT: movl $128, %eax
; X32_AVX-NEXT: vcvtsi2sdl %eax, %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: test5:
; X64_AVX: ## BB#0: ## %entry
; X64_AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4.569870e+02,1.233210e+02]
; X64_AVX-NEXT: movl $128, %eax
; X64_AVX-NEXT: vcvtsi2sdl %eax, %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
  %0 = tail call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> <double 4.569870e+02, double 1.233210e+02>, i32 128) nounwind readnone
  ret <2 x double> %0
}

declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone

; A single-use (possibly unaligned, align 1) scalar load as the second operand
; of min.ss should be folded into minss's memory operand.
define <4 x float> @minss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: minss_fold:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: minss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: minss_fold:
; X64: ## BB#0: ## %entry
; X64-NEXT: minss (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: minss_fold:
; X32_AVX: ## BB#0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vminss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: minss_fold:
; X64_AVX: ## BB#0: ## %entry
; X64_AVX-NEXT: vminss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}

; Same load-folding pattern as minss_fold, for max.ss.
define <4 x float> @maxss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: maxss_fold:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: maxss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: maxss_fold:
; X64: ## BB#0: ## %entry
; X64-NEXT: maxss (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: maxss_fold:
; X32_AVX: ## BB#0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vmaxss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: maxss_fold:
; X64_AVX: ## BB#0: ## %entry
; X64_AVX-NEXT: vmaxss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit4.i)
  ret <4 x float> %1
}

; Same load-folding pattern, for cmp.ss with predicate 0 (cmpeqss).
define <4 x float> @cmpss_fold(float* %x, <4 x float> %y) {
; X32-LABEL: cmpss_fold:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: cmpeqss (%eax), %xmm0
; X32-NEXT: retl
;
; X64-LABEL: cmpss_fold:
; X64: ## BB#0: ## %entry
; X64-NEXT: cmpeqss (%rdi), %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: cmpss_fold:
; X32_AVX: ## BB#0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vcmpeqss (%eax), %xmm0, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: cmpss_fold:
; X64_AVX: ## BB#0: ## %entry
; X64_AVX-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0
; X64_AVX-NEXT: retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %vecinit2.i = insertelement <4 x float> %vecinit.i, float 0.000000e+00, i32 1
  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float 0.000000e+00, i32 2
  %vecinit4.i = insertelement <4 x float> %vecinit3.i, float 0.000000e+00, i32 3
  %1 = tail call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %y, <4 x float> %vecinit4.i, i8 0)
  ret <4 x float> %1
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone


; The load has two uses (min.ss and max.ss), so it is NOT folded; the expected
; code loads it once into a register and reuses it.
define <4 x float> @double_fold(float* %x, <4 x float> %y) {
; X32-LABEL: double_fold:
; X32: ## BB#0: ## %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT: movaps %xmm0, %xmm2
; X32-NEXT: minss %xmm1, %xmm2
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: addps %xmm2, %xmm0
; X32-NEXT: retl
;
; X64-LABEL: double_fold:
; X64: ## BB#0: ## %entry
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT: movaps %xmm0, %xmm2
; X64-NEXT: minss %xmm1, %xmm2
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: addps %xmm2, %xmm0
; X64-NEXT: retq
;
; X32_AVX-LABEL: double_fold:
; X32_AVX: ## BB#0: ## %entry
; X32_AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32_AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32_AVX-NEXT: vminss %xmm1, %xmm0, %xmm2
; X32_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X32_AVX-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X32_AVX-NEXT: retl
;
; X64_AVX-LABEL: double_fold:
; X64_AVX: ## BB#0: ## %entry
; X64_AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64_AVX-NEXT: vminss %xmm1, %xmm0, %xmm2
; X64_AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64_AVX-NEXT: vaddps %xmm0, %xmm2, %xmm0
; X64_AVX-NEXT: retq
entry:
  %0 = load float, float* %x, align 1
  %vecinit.i = insertelement <4 x float> undef, float %0, i32 0
  %1 = tail call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %y, <4 x float> %vecinit.i)
  %2 = tail call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %y, <4 x float> %vecinit.i)
  %3 = fadd <4 x float> %1, %2
  ret <4 x float> %3
}