; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI -check-prefix=FUNC %s

; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
; beneficial even without fp32 denormals, but they do require no-infs-fp-math
; for correctness.

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare double @llvm.fabs.f64(double) #0
declare double @llvm.fma.f64(double, double, double) #0
declare float @llvm.fma.f32(float, float, float) #0
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
37
; (fadd (fmul x, y), z) -> (fma x, y, z)
; FUNC-LABEL: {{^}}combine_to_fma_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fadd double %mul, %c
  %fma1 = fadd double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
70
; (fadd x, (fmul y, z)) -> (fma y, z, x)
; FUNC-LABEL: {{^}}combine_to_fma_f64_1:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fadd double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
94
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %mul, %c
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
118
; (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_f64_0_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %mul, %c
  %fma1 = fsub double %mul, %d
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
151
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %fma = fsub double %c, %mul
  store double %fma, double addrspace(1)* %gep.out
  ret void
}
175
; (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
; FUNC-LABEL: {{^}}combine_to_fma_fsub_1_f64_2use:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %fma0 = fsub double %c, %mul
  %fma1 = fsub double %d, %mul
  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
208
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma = fsub double %mul.neg, %c

  store double %fma, double addrspace(1)* %gep.out
  ret void
}
234
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_neg:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul.neg, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
268
; (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
; FUNC-LABEL: {{^}}combine_to_fma_fsub_2_f64_2uses_mul:
; SI-DAG: buffer_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
; SI-DAG: v_fma_f64 [[RESULT1:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[D]]
; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI: s_endpgm
define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.out.0 = getelementptr double, double addrspace(1)* %out, i32 %tid
  %gep.out.1 = getelementptr double, double addrspace(1)* %gep.out.0, i32 1

  %a = load volatile double, double addrspace(1)* %gep.0
  %b = load volatile double, double addrspace(1)* %gep.1
  %c = load volatile double, double addrspace(1)* %gep.2
  %d = load volatile double, double addrspace(1)* %gep.3

  %mul = fmul double %a, %b
  %mul.neg = fsub double -0.0, %mul
  %fma0 = fsub double %mul.neg, %c
  %fma1 = fsub double %mul, %d

  store volatile double %fma0, double addrspace(1)* %gep.out.0
  store volatile double %fma1, double addrspace(1)* %gep.out.1
  ret void
}
302
; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_0_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
  %tmp2 = fsub double %tmp1, %z

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}
336
; fold (fsub x, (fma y, z, (fmul u, v)))
;   -> (fma (fneg y), z, (fma (fneg u), v, x))

; FUNC-LABEL: {{^}}aggressive_combine_to_fma_fsub_1_f64:
; SI-DAG: buffer_load_dwordx2 [[X:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dwordx2 [[Y:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
; SI: buffer_store_dwordx2 [[RESULT]]
define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
  %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
  %gep.2 = getelementptr double, double addrspace(1)* %gep.0, i32 2
  %gep.3 = getelementptr double, double addrspace(1)* %gep.0, i32 3
  %gep.4 = getelementptr double, double addrspace(1)* %gep.0, i32 4
  %gep.out = getelementptr double, double addrspace(1)* %out, i32 %tid

  %x = load volatile double, double addrspace(1)* %gep.0
  %y = load volatile double, double addrspace(1)* %gep.1
  %z = load volatile double, double addrspace(1)* %gep.2
  %u = load volatile double, double addrspace(1)* %gep.3
  %v = load volatile double, double addrspace(1)* %gep.4

  %tmp0 = fmul double %u, %v
  %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
  %tmp2 = fsub double %x, %tmp1

  store double %tmp2, double addrspace(1)* %gep.out
  ret void
}
371
;
; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
;

; FUNC-LABEL: {{^}}test_f32_mul_add_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}
391
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load volatile float, float addrspace(1)* %in1
  %y = load volatile float, float addrspace(1)* %in2
  %a = fadd float %x, 1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}
407
; FUNC-LABEL: {{^}}test_f32_mul_add_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %a, %y
  store float %m, float addrspace(1)* %out
  ret void
}
423
; FUNC-LABEL: {{^}}test_f32_mul_y_add_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %a = fadd float %x, -1.0
  %m = fmul float %y, %a
  store float %m, float addrspace(1)* %out
  ret void
}
439
; FUNC-LABEL: {{^}}test_f32_mul_sub_one_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
455
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_one_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float 1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
471
; FUNC-LABEL: {{^}}test_f32_mul_sub_negone_x_y:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                           float addrspace(1)* %in1,
                                           float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
487
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_negone_x:
; SI-NOFMA: v_sub_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float -1.0, %x
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
503
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_one_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                        float addrspace(1)* %in1,
                                        float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
519
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_one:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], -1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                      float addrspace(1)* %in1,
                                      float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, 1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
535
; FUNC-LABEL: {{^}}test_f32_mul_sub_x_negone_y:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %s, %y
  store float %m, float addrspace(1)* %out
  ret void
}
551
; FUNC-LABEL: {{^}}test_f32_mul_y_sub_x_negone:
; SI-NOFMA: v_add_f32_e32 [[VS:v[0-9]]], 1.0, [[VX:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
;
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %s = fsub float %x, -1.0
  %m = fmul float %y, %s
  store float %m, float addrspace(1)* %out
  ret void
}
567
;
; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
;

; FUNC-LABEL: {{^}}test_f32_interp:
; SI-NOFMA: v_sub_f32_e32 [[VT1:v[0-9]]], 1.0, [[VT:v[0-9]]]
; SI-NOFMA: v_mul_f32_e32 [[VTY:v[0-9]]], [[VT1]], [[VY:v[0-9]]]
; SI-NOFMA: v_mac_f32_e32 [[VTY]], [[VT]], [[VX:v[0-9]]]
;
; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
define void @test_f32_interp(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2,
                             float addrspace(1)* %in3) {
  %x = load float, float addrspace(1)* %in1
  %y = load float, float addrspace(1)* %in2
  %t = load float, float addrspace(1)* %in3
  %t1 = fsub float 1.0, %t
  %tx = fmul float %x, %t
  %ty = fmul float %y, %t1
  %r = fadd float %tx, %ty
  store float %r, float addrspace(1)* %out
  ret void
}
593
; FUNC-LABEL: {{^}}test_f64_interp:
; SI-NOFMA: v_add_f64 [[VT1:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], 1.0
; SI-NOFMA: v_mul_f64 [[VTY:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VT1]]
; SI-NOFMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VTY]]
;
; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define void @test_f64_interp(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2,
                             double addrspace(1)* %in3) {
  %x = load double, double addrspace(1)* %in1
  %y = load double, double addrspace(1)* %in2
  %t = load double, double addrspace(1)* %in3
  %t1 = fsub double 1.0, %t
  %tx = fmul double %x, %t
  %ty = fmul double %y, %t1
  %r = fadd double %tx, %ty
  store double %r, double addrspace(1)* %out
  ret void
}
615
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }