; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate_ieee_ninf:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: finite_f32_estimate_daz_ninf:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK: # %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fsqrt
; CHECK-NEXT: retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK: # %bb.0:
; CHECK-NEXT: fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT: fsqrt
; CHECK-NEXT: retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
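; The tests below use "denormal-fp-math"="ieee" (attribute #3). A note on what
; the checks show: when the rsqrt expansion is emitted (the vector ninf case),
; it is guarded against denormals - |x| is compared against the smallest
; normalized float (1.17549435E-38), and lanes whose input is denormal or zero
; are masked to +0.0, since the hardware estimate is not reliable there.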

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrtf_check_denorms_ninf:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call ninf afn float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtps %xmm0, %xmm0
; AVX-NEXT: retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT: cmpleps %xmm0, %xmm1
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm1
; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT: divss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: f32_no_estimate:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtss %xmm0, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}
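
; The estimate sequences above refine the hardware estimate e0 = rsqrt(x) with
; one Newton-Raphson step for 1/sqrt(x), written in the rearranged form
;   e1 = (-0.5 * e0) * (x * e0 * e0 - 3.0)
; which is algebraically equal to e0 * (1.5 - 0.5 * x * e0 * e0) and is why
; -5.0E-1 and -3.0E+0 appear as constant-pool values in the checks.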

define float @f32_estimate2(float %x) #5 {
; SSE-LABEL: f32_estimate2:
; SSE: # %bb.0:
; SSE-NEXT: sqrtss %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: f32_estimate2:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
  ret float %sqrt
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm0, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v4f32_no_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %xmm0, %xmm0
; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtps %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; SSE-LABEL: v4f32_estimate2:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm2
; SSE-NEXT: mulps %xmm0, %xmm2
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT: cmpleps %xmm0, %xmm1
; SSE-NEXT: andps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: v4f32_estimate2:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v4f32_estimate2:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  ret <4 x float> %sqrt
}

define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm1, %xmm2
; SSE-NEXT: sqrtps %xmm0, %xmm3
; SSE-NEXT: movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: divps %xmm3, %xmm0
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: v8f32_no_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtps %ymm0, %ymm0
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT: addps %xmm4, %xmm0
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: rsqrtps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps %xmm4, %xmm1
; SSE-NEXT: mulps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm1
; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %ymm0, %ymm1
; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT: retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE: # %bb.0:
; SSE-NEXT: sqrtps %xmm3, %xmm4
; SSE-NEXT: sqrtps %xmm2, %xmm5
; SSE-NEXT: sqrtps %xmm1, %xmm2
; SSE-NEXT: sqrtps %xmm0, %xmm1
; SSE-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: divps %xmm1, %xmm0
; SSE-NEXT: movaps %xmm3, %xmm1
; SSE-NEXT: divps %xmm2, %xmm1
; SSE-NEXT: movaps %xmm3, %xmm2
; SSE-NEXT: divps %xmm5, %xmm2
; SSE-NEXT: divps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vsqrtps %ymm1, %ymm1
; AVX1-NEXT: vsqrtps %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vsqrtps %zmm0, %zmm0
; AVX512-NEXT: vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT: vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm0, %xmm5
; SSE-NEXT: movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT: mulps %xmm5, %xmm0
; SSE-NEXT: mulps %xmm5, %xmm0
; SSE-NEXT: movaps %xmm5, %xmm6
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT: addps %xmm5, %xmm0
; SSE-NEXT: mulps %xmm6, %xmm0
; SSE-NEXT: rsqrtps %xmm1, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: addps %xmm5, %xmm1
; SSE-NEXT: mulps %xmm6, %xmm1
; SSE-NEXT: rsqrtps %xmm2, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: mulps %xmm4, %xmm6
; SSE-NEXT: addps %xmm5, %xmm2
; SSE-NEXT: mulps %xmm6, %xmm2
; SSE-NEXT: rsqrtps %xmm3, %xmm6
; SSE-NEXT: mulps %xmm6, %xmm4
; SSE-NEXT: mulps %xmm6, %xmm3
; SSE-NEXT: mulps %xmm6, %xmm3
; SSE-NEXT: addps %xmm5, %xmm3
; SSE-NEXT: mulps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %ymm0, %ymm2
; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vrsqrtps %ymm1, %ymm5
; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT: vmulps %ymm3, %ymm5, %ymm3
; AVX1-NEXT: vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT: vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}
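
; Note that for v16f32, AVX512 uses the vrsqrt14ps estimate (relative error
; <= 2^-14) on a single 512-bit vector, while SSE and AVX1 split the operation
; into 128-bit and 256-bit halves, respectively.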

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
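; This folds because fabs(y) = sqrt(y*y), so fabs(y) * sqrt(z) = sqrt(y*y*z)
; (ignoring rounding and overflow, which the fast-math flags permit), and the
; entire divisor collapses into a single rsqrt estimate.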

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE: # %bb.0:
; SSE-NEXT: mulss %xmm1, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: xorps %xmm2, %xmm2
; SSE-NEXT: rsqrtss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: mulss %xmm2, %xmm1
; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: mulps %xmm1, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: rsqrtps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vrsqrtps %xmm1, %xmm2
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vrsqrtps %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %a, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE: # %bb.0:
; SSE-NEXT: rsqrtps %xmm2, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: mulps %xmm3, %xmm2
; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulps %xmm2, %xmm3
; SSE-NEXT: divps %xmm1, %xmm3
; SSE-NEXT: mulps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1: # %bb.0:
; AVX1-NEXT: vrsqrtps %xmm2, %xmm3
; AVX1-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512: # %bb.0:
; AVX512-NEXT: vrsqrtps %xmm2, %xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT: vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE: # %bb.0:
; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: sqrtsd %xmm2, %xmm2
; SSE-NEXT: mulsd %xmm2, %xmm1
; SSE-NEXT: divsd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX: # %bb.0:
; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT: vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
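; sqrt(y) is only real for y >= 0, so under the fast-math assumptions y can be
; treated as non-negative and y * sqrt(y) = sqrt(y*y*y) holds without the fabs.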

define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: rsqrtss %xmm2, %xmm1
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: mulss %xmm1, %xmm2
; SSE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulss %xmm1, %xmm0
; SSE-NEXT: mulss %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT: retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: rsqrtps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: mulps %xmm1, %xmm2
; SSE-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT: mulps %xmm2, %xmm1
; SSE-NEXT: mulps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vrsqrtps %xmm1, %xmm2
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vrsqrtps %xmm1, %xmm2
; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT: retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %y, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}
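
; x / sqrt(x) == sqrt(x) for non-negative x, so the tests below fold to a
; plain sqrt with no division and no estimate sequence at all.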

define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE: # %bb.0:
; SSE-NEXT: sqrtpd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtpd %xmm0, %xmm0
; AVX-NEXT: retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: movsd %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd %xmm0, (%rdi)
; AVX-NEXT: retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, ptr %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movsd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovsd %xmm1, (%rdi)
; AVX-NEXT: retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE: # %bb.0:
; SSE-NEXT: sqrtpd %xmm0, %xmm0
; SSE-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT: divpd %xmm0, %xmm1
; SSE-NEXT: movupd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtpd %xmm0, %xmm0
; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovupd %xmm1, (%rdi)
; AVX-NEXT: retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, ptr %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE: # %bb.0:
; SSE-NEXT: sqrtsd %xmm0, %xmm0
; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: divsd %xmm0, %xmm1
; SSE-NEXT: movsd %xmm1, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX: # %bb.0:
; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT: vmovsd %xmm1, (%rdi)
; AVX-NEXT: retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
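
; A note on the "reciprocal-estimates" strings: a name such as "sqrt" or
; "vec-sqrt" enables estimate codegen for that operation (an 'f' suffix, as in
; "sqrtf", restricts it to single precision), a '!' prefix disables it, and a
; ":N" suffix sets the number of refinement steps, so "all:0" requests
; estimates for every operation with zero Newton-Raphson iterations.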