; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512
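;
; Three subtarget configurations share one body of IR: assertions common to
; all runs use the CHECK prefix, assertions common to both AVX runs use AVX,
; and AVX1/AVX512 carry the lines where their codegen diverges.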

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994
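;
; An rsqrt estimate can mishandle denormal inputs (the subject of the bug
; above), so with "denormal-fp-math"="ieee" (attribute #3) the expanded
; sequences guard the result: compare fabs(x) against the smallest normal
; float, 1.17549435E-38, and force the answer to zero below that. This is
; the andps/cmpleps/andps tail visible in the vector checks below.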

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %x) #2
  ret float %call
}

define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

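; A note on the constants in the estimate expansions below: one
; Newton-Raphson refinement of an rsqrt estimate r for input x is
;   r' = r * (1.5 - 0.5*x*r*r) = (-0.5*r) * (x*r*r - 3.0)
; and the backend emits the second form, which is why -5.0E-1 and -3.0E+0
; (or memory constants holding them) appear throughout the checks.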
define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

define float @f32_estimate2(float %x) #5 {
; SSE-LABEL: f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_estimate2:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
  ret float %sqrt
}

define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; SSE-LABEL: v4f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  ret <4 x float> %sqrt
}

define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm5, %xmm0
; SSE-NEXT:    movaps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    rsqrtps %xmm3, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm5
; AVX1-NEXT:    vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vmulps %ymm3, %ymm5, %ymm3
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)
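; Since fabs(y) = sqrt(y*y), the denominator fabs(y)*sqrt(z) equals
; sqrt(y*y*z) under the fast-math assumptions on these ops, so the whole
; division becomes a single rsqrt estimate of y*y*z.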

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    rsqrtss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %a, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)
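;
; 'arcp' alone permits approximating the division and the sqrt with
; reciprocal estimates, but folding fabs(y) into the estimate's operand (as
; in the function above) also reorders the multiply, so that additionally
; requires 'reassoc'.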

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    sqrtsd %xmm2, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
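;
; Worked out: x / (y * sqrt(y)) = x / sqrt(y*y*y) = x * rsqrt(y*y*y). If y
; is negative, both forms are NaN anyway (sqrt of a negative value on one
; side, a negative cube under the rsqrt on the other), so dropping fabs is
; safe.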

define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    rsqrtss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}

; This is a special case for the general pattern above -
; if the sqrt operand is the same as the other mul op,
; then fabs may be omitted.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)

define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    rsqrtps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %y, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

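; x / sqrt(x) --> sqrt(x), so the whole expression folds to a single sqrt
; with no divide needed.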
define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}

define double @sqrt_fdiv_common_operand_extra_use(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, ptr %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}

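; The x / sqrt(x) --> sqrt(x) fold should happen before 1/sqrt(x) is turned
; into anything else: one sqrt is emitted, and the stored reciprocal reuses
; it through an ordinary divide.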
define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT:    divpd %xmm0, %xmm1
; SSE-NEXT:    movupd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovupd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, ptr %p, align 8
  ret <2 x double> %sqrt_fast
}

define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}

attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
attributes #2 = { nounwind readnone }
attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee" }
attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
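
; A note on the "reciprocal-estimates" strings above: the value is a
; comma-separated list in the style of clang's -mrecip option, where a
; leading '!' disables an entry (attribute #0 turns off the sqrt and div
; estimates), a bare name enables one, and a ':N' suffix selects the number
; of Newton-Raphson refinement steps, so "all:0" requests every estimate
; with no refinement.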