; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s

; fadd of a loaded <2 x float> with itself, so every operand is a vector register.
; The gfx900 run expects two scalar v_add_f32, the gfx90a run a single v_pk_add_f32.
; GCN-LABEL: {{^}}fadd_v2_vv:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fadd_v2_vv(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, %load
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd of a loaded <2 x float> with the <2 x float> kernel argument %x.
; Both runs expect the argument to be consumed directly from scalar registers;
; on gfx90a the packed add must carry no trailing modifiers.
; GCN-LABEL: {{^}}fadd_v2_vs:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, %x
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; <4 x float> variant of the vector + scalar-argument fadd.
; Expected instruction counts: four v_add_f32 on gfx900, two v_pk_add_f32 on gfx90a.
; GCN-LABEL: {{^}}fadd_v4_vs:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
  %add = fadd <4 x float> %load, %x
  store <4 x float> %add, <4 x float> addrspace(1)* %gep, align 16
  ret void
}

; <32 x float> variant of the vector + scalar-argument fadd.
; Expected instruction counts: 32 v_add_f32 on gfx900, 16 v_pk_add_f32 on gfx90a.
; GCN-LABEL: {{^}}fadd_v32_vs:
; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
  %add = fadd <32 x float> %load, %x
  store <32 x float> %add, <32 x float> addrspace(1)* %gep, align 128
  ret void
}

; fadd with a splat of 100.0 (bit pattern 0x42c80000).
; gfx900 is expected to use the literal in each scalar add. gfx90a is expected
; to move it into one SGPR and reuse that register for both lanes via
; op_sel_hi:[1,0].
; GCN-LABEL: {{^}}fadd_v2_v_imm:
; GFX90A:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_imm(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with a splat built from the workitem id bitcast to float.
; The checks pin the splat source to v0 (gfx900) and v[0:1] with
; op_sel_hi:[1,0] (gfx90a).
; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with a splat of 1.0. Both runs expect 1.0 to appear directly as an
; instruction operand; gfx90a additionally expects op_sel_hi:[1,0].
; GCN-LABEL: {{^}}fadd_v2_v_lit_splat:
; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 1.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 1.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with the non-splat constant <1.0, 0.0> (high element is zero).
; gfx900 expects one add with 0 and one with 1.0, in either order. gfx90a
; expects a single s_mov_b64 of 0x3f800000 (the bit pattern of 1.0) whose
; register pair feeds the packed add.
; GCN-LABEL: {{^}}fadd_v2_v_lit_hi0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b64 [[K:s\[[0-9:]+\]]], 0x3f800000
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], [[K]]
define amdgpu_kernel void @fadd_v2_v_lit_hi0(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 0.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with the non-splat constant <0.0, 1.0> (low element is zero).
; gfx90a expects the two halves to be set by separate s_mov_b32 instructions
; and the resulting pair s[LO:HI] to be used without modifiers.
; GCN-LABEL: {{^}}fadd_v2_v_lit_lo0:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s[[LO:[0-9]+]], 0
; GFX90A-DAG: s_mov_b32 s[[HI:[0-9]+]], 1.0
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[LO]]:[[HI]]]{{$}}
define amdgpu_kernel void @fadd_v2_v_lit_lo0(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 0.0, float 1.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with the non-splat constant <1.0, 2.0>.
; gfx900 expects each constant as an operand of its own add. gfx90a expects
; both constants to be moved into SGPRs first and the packed add to take the
; SGPR pair with no modifiers.
; GCN-LABEL: {{^}}fadd_v2_v_unfoldable_lit:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 1.0
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 2.0
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %add = fadd <2 x float> %load, <float 1.0, float 2.0>
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with a splat of the negated scalar argument (-x in both elements).
; gfx900 expects the negation to become v_subrev_f32. gfx90a expects it to be
; folded into neg_lo/neg_hi on the second source of the packed add.
; GCN-LABEL: {{^}}fadd_v2_v_fneg:
; GFX900-COUNT-2: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with <-x, x>: only the low element of the second operand is negated.
; gfx900 expects one add and one subrev. gfx90a expects a single packed add
; with neg_lo:[0,1] and no neg_hi.
; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %x, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with <x, -x>: only the high element of the second operand is negated.
; gfx900 expects one add and one subrev. gfx90a expects a single packed add
; with neg_hi:[0,1] and no neg_lo.
; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %x, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with <-x, y>, where x and y are two different scalar arguments.
; gfx90a expects a packed add on the SGPR pair with only neg_lo:[0,1]
; (no op_sel modifiers).
; GCN-LABEL: {{^}}fadd_v2_v_fneg_lo2:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] neg_lo:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_lo2(<2 x float> addrspace(1)* %a, float %x, float %y) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %y, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd with <y, -x>, where x and y are two different scalar arguments.
; gfx90a expects the element order to be expressed with
; op_sel:[0,1] op_sel_hi:[1,0] and the negation with neg_hi:[0,1].
; GCN-LABEL: {{^}}fadd_v2_v_fneg_hi2:
; GFX900-DAG: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX900-DAG: v_subrev_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:     v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fadd_v2_v_fneg_hi2(<2 x float> addrspace(1)* %a, float %x, float %y) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %y, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %add = fadd <2 x float> %load, %k
  store <2 x float> %add, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fmul of a loaded <2 x float> with itself, so every operand is a vector register.
; The gfx900 run expects two scalar v_mul_f32, the gfx90a run a single v_pk_mul_f32.
; GCN-LABEL: {{^}}fmul_v2_vv:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fmul_v2_vv(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, %load
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fmul of a loaded <2 x float> with the <2 x float> kernel argument %x.
; Both runs expect the argument to be consumed directly from scalar registers;
; on gfx90a the packed multiply must carry no trailing modifiers.
; GCN-LABEL: {{^}}fmul_v2_vs:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, %x
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; <4 x float> variant of the vector * scalar-argument fmul.
; Expected instruction counts: four v_mul_f32 on gfx900, two v_pk_mul_f32 on gfx90a.
; GCN-LABEL: {{^}}fmul_v4_vs:
; GFX900-COUNT-4: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
  %mul = fmul <4 x float> %load, %x
  store <4 x float> %mul, <4 x float> addrspace(1)* %gep, align 16
  ret void
}

; <32 x float> variant of the vector * scalar-argument fmul.
; Expected instruction counts: 32 v_mul_f32 on gfx900, 16 v_pk_mul_f32 on gfx90a.
; GCN-LABEL: {{^}}fmul_v32_vs:
; GFX900-COUNT-32: v_mul_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-16: v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
  %mul = fmul <32 x float> %load, %x
  store <32 x float> %mul, <32 x float> addrspace(1)* %gep, align 128
  ret void
}

; fmul with a splat of 100.0 (bit pattern 0x42c80000).
; gfx900 is expected to use the literal in each scalar multiply. gfx90a is
; expected to move it into one SGPR and reuse that register for both lanes via
; op_sel_hi:[1,0].
; GCN-LABEL: {{^}}fmul_v2_v_imm:
; GFX90A:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_imm(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, <float 100.0, float 100.0>
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fmul with a splat built from the workitem id bitcast to float.
; The checks pin the splat source to v0 (gfx900) and v[0:1] with
; op_sel_hi:[1,0] (gfx90a).
; GCN-LABEL: {{^}}fmul_v2_v_v_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fmul with a splat of 4.0. Both runs expect 4.0 to appear directly as an
; instruction operand; gfx90a additionally expects op_sel_hi:[1,0].
; GCN-LABEL: {{^}}fmul_v2_v_lit_splat:
; GFX900-COUNT-2: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0 op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @fmul_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, <float 4.0, float 4.0>
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fmul with the non-splat constant <4.0, 3.0>; 3.0 appears in the checks as
; the literal 0x40400000.
; gfx90a expects both constants to be moved into SGPRs first and the packed
; multiply to take the SGPR pair with no modifiers.
; GCN-LABEL: {{^}}fmul_v2_v_unfoldable_lit:
; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
; GFX900-DAG: v_mul_f32_e32 v{{[0-9]+}}, 0x40400000, v{{[0-9]+}}
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 0x40400000
; GFX90A:     v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %mul = fmul <2 x float> %load, <float 4.0, float 3.0>
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fmul with a splat of the negated scalar argument (-x in both elements).
; gfx900 expects the negation as a source modifier (-s) on v_mul_f32_e64.
; gfx90a expects neg_lo/neg_hi on the second source of the packed multiply.
; GCN-LABEL: {{^}}fmul_v2_v_fneg:
; GFX900-COUNT-2: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}
; GFX90A:         v_pk_mul_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @fmul_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %mul = fmul <2 x float> %load, %k
  store <2 x float> %mul, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; llvm.fma on <2 x float> with the loaded value used for all three operands.
; The gfx900 run expects two scalar v_fma_f32, the gfx90a run a single v_pk_fma_f32.
; GCN-LABEL: {{^}}fma_v2_vv:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
define amdgpu_kernel void @fma_v2_vv(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %load, <2 x float> %load)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; llvm.fma on <2 x float>: loaded vector times the kernel argument %x, plus %x.
; Both runs expect %x to be read from scalar registers for the second and
; third sources; gfx90a expects no trailing modifiers.
; GCN-LABEL: {{^}}fma_v2_vs:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_vs(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %x, <2 x float> %x)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; <4 x float> variant of the vector/scalar-argument fma.
; Expected instruction counts: four v_fma_f32 on gfx900, two v_pk_fma_f32 on gfx90a.
; GCN-LABEL: {{^}}fma_v4_vs:
; GFX900-COUNT-4: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v4_vs(<4 x float> addrspace(1)* %a, <4 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i32 %id
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep, align 16
  %fma = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %load, <4 x float> %x, <4 x float> %x)
  store <4 x float> %fma, <4 x float> addrspace(1)* %gep, align 16
  ret void
}

; <32 x float> variant of the vector/scalar-argument fma.
; Expected instruction counts: 32 v_fma_f32 on gfx900, 16 v_pk_fma_f32 on gfx90a.
; GCN-LABEL: {{^}}fma_v32_vs:
; GFX900-COUNT-32: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GFX90A-COUNT-16: v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v32_vs(<32 x float> addrspace(1)* %a, <32 x float> %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %a, i32 %id
  %load = load <32 x float>, <32 x float> addrspace(1)* %gep, align 128
  %fma = tail call <32 x float> @llvm.fma.v32f32(<32 x float> %load, <32 x float> %x, <32 x float> %x)
  store <32 x float> %fma, <32 x float> addrspace(1)* %gep, align 128
  ret void
}

; fma with two splat constants: multiplier 100.0 (0x42c80000) and addend
; 200.0 (0x43480000).
; Both runs expect the multiplier in an SGPR (K1) and the addend in a VGPR
; (K2). gfx90a reuses each register for both lanes via op_sel_hi:[1,0,0].
; GCN-LABEL: {{^}}fma_v2_v_imm:
; GCN-DAG:        s_mov_b32 s[[K1:[0-9]+]], 0x42c80000
; GCN-DAG:        v_mov_b32_e32 v[[K2:[0-9]+]], 0x43480000
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s[[K1]], v[[K2]]
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K1]]:{{[0-9:]+}}], v[[[K2]]:{{[0-9:]+}}] op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_imm(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 100.0, float 100.0>, <2 x float> <float 200.0, float 200.0>)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fma whose multiplier and addend are the same splat of the workitem id
; bitcast to float.
; The checks pin both splat sources to v0 (gfx900) and v[0:1] with
; op_sel_hi:[1,0,0] (gfx90a).
; GCN-LABEL: {{^}}fma_v2_v_v_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v0, v0
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1], v[0:1] op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_v_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fid = bitcast i32 %id to float
  %tmp1 = insertelement <2 x float> undef, float %fid, i64 0
  %k = insertelement <2 x float> %tmp1, float %fid, i64 1
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fma with splat constants 4.0 (multiplier) and 1.0 (addend).
; Both runs expect the constants to appear directly as instruction operands;
; gfx90a additionally expects op_sel_hi:[1,0,0].
; GCN-LABEL: {{^}}fma_v2_v_lit_splat:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 4.0, 1.0 op_sel_hi:[1,0,0]{{$}}
define amdgpu_kernel void @fma_v2_v_lit_splat(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 4.0>, <2 x float> <float 1.0, float 1.0>)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fma with non-splat constants: multiplier <4.0, 3.0>, addend <1.0, 2.0>.
; 3.0 appears in the checks as the literal 0x40400000 and is expected in an
; SGPR on both targets. gfx90a is also expected to move 4.0 into an SGPR and
; 1.0 and 2.0 into VGPRs, so the packed fma reads one SGPR pair and one VGPR
; pair with no modifiers.
; GCN-LABEL: {{^}}fma_v2_v_unfoldable_lit:
; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, 0x40400000
; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, 4.0, 1.0
; GFX900-DAG: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, 2.0
; GFX90A-DAG: s_mov_b32 s{{[0-9]+}}, 4.0
; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 1.0
; GFX90A-DAG: v_mov_b32_e32 v{{[0-9]+}}, 2.0
; GFX90A:     v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], v[{{[0-9:]+}}]{{$}}
define amdgpu_kernel void @fma_v2_v_unfoldable_lit(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> <float 4.0, float 3.0>, <2 x float> <float 1.0, float 2.0>)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fma whose multiplier and addend are the same splat of the negated scalar
; argument.
; gfx900 expects -s source modifiers on both operands. gfx90a expects
; neg_lo:[0,1,1] neg_hi:[0,1,1] on the packed fma.
; GCN-LABEL: {{^}}fma_v2_v_fneg:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, -s{{[0-9]+}}, -s{{[0-9]+}}
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] op_sel_hi:[1,0,0] neg_lo:[0,1,1] neg_hi:[0,1,1]{{$}}
define amdgpu_kernel void @fma_v2_v_fneg(<2 x float> addrspace(1)* %a, float %x) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub float -0.0, %x
  %tmp1 = insertelement <2 x float> undef, float %fneg, i64 0
  %k = insertelement <2 x float> %tmp1, float %fneg, i64 1
  %fma = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %load, <2 x float> %k, <2 x float> %k)
  store <2 x float> %fma, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; fadd of a vector loaded from addrspace(3) with a broadcast of a negated
; scalar, also loaded from addrspace(3). The broadcast is written as
; insertelement + shufflevector with a zeroinitializer mask.
; gfx900 expects two v_sub_f32. gfx90a expects one packed add with
; op_sel_hi:[1,0] and neg_lo/neg_hi on the second source.
; GCN-LABEL: {{^}}add_vector_neg_bitcast_scalar_lo:
; GFX900-COUNT-2: v_sub_f32_e32
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]
define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
bb:
  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
  %neg.scalar0 = fsub float -0.0, %scalar0

  %neg.scalar0.vec = insertelement <2 x float> undef, float %neg.scalar0, i32 0
  %neg.scalar0.broadcast = shufflevector <2 x float> %neg.scalar0.vec, <2 x float> undef, <2 x i32> zeroinitializer

  %result = fadd <2 x float> %vec0, %neg.scalar0.broadcast
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; fma of two vectors loaded from addrspace(3) with a negated addend. The
; addend is assembled from two separately loaded scalars and then negated as
; a whole vector.
; gfx900 expects the negation as a -v modifier on the third operand. gfx90a
; expects neg_lo:[0,0,1] neg_hi:[0,0,1] on the packed fma.
; GCN-LABEL: {{^}}fma_vector_vector_neg_scalar_lo_scalar_hi:
; GFX900-COUNT-2: v_fma_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
; GFX90A:         v_pk_fma_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] neg_lo:[0,0,1] neg_hi:[0,0,1]
define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds, float addrspace(3)* %arg2) {
bb:
  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
  %arg2.gep = getelementptr inbounds float, float addrspace(3)* %arg2, i32 2

  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 4
  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 4

  %scalar0 = load volatile float, float addrspace(3)* %arg2, align 4
  %scalar1 = load volatile float, float addrspace(3)* %arg2.gep, align 4

  %vec.ins0 = insertelement <2 x float> undef, float %scalar0, i32 0
  %vec2 = insertelement <2 x float> %vec.ins0, float %scalar1, i32 1
  %neg.vec2 = fsub <2 x float> <float -0.0, float -0.0>, %vec2

  %result = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %neg.vec2)
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; fadd where the second operand has its two elements swapped by a
; shufflevector with mask <1, 0>.
; gfx90a expects the swap to be folded into op_sel:[0,1] op_sel_hi:[1,0]
; on the packed add.
; NOTE(review): attribute group #0 has no definition in the visible part of
; this file - confirm one exists elsewhere or drop the reference.
; GCN-LABEL: {{^}}shuffle_add_f32:
; GFX900-COUNT-2: v_add_f32_e32
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0]{{$}}
define amdgpu_kernel void @shuffle_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
  %vec1.swap = shufflevector <2 x float> %vec1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.swap
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
  ret void
}

; fadd where the second operand is negated and then has its elements swapped
; (shufflevector mask <1, 0>).
; gfx900 expects two v_sub_f32. gfx90a expects both the swap and the negation
; to be folded into op_sel/op_sel_hi and neg_lo/neg_hi on the packed add.
; NOTE(review): %f32 is a volatile load from an undef pointer whose result is
; never used; it looks like a leftover - confirm before removing, since a
; volatile load still produces an instruction.
; NOTE(review): attribute group #0 has no definition in the visible part of
; this file - confirm one exists elsewhere or drop the reference.
; GCN-LABEL: {{^}}shuffle_neg_add_f32:
; GFX900-COUNT-2: v_sub_f32_e32
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel:[0,1] op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1]{{$}}
define amdgpu_kernel void @shuffle_neg_add_f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %lds) #0 {
bb:
  %vec0 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds, align 8
  %lds.gep1 = getelementptr inbounds <2 x float>, <2 x float> addrspace(3)* %lds, i32 1
  %f32 = load volatile float, float addrspace(3)* undef, align 8
  %vec1 = load volatile <2 x float>, <2 x float> addrspace(3)* %lds.gep1, align 8
  %vec1.neg = fsub <2 x float> <float -0.0, float -0.0>, %vec1
  %vec1.neg.swap = shufflevector <2 x float> %vec1.neg, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  %result = fadd <2 x float> %vec0, %vec1.neg.swap
  store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
  ret void
}

; Chain of fadd-with-zero, element shuffles and a final fsub of zero on the
; kernel argument; the result is stored through an undef pointer.
; gfx900 expects one v_add_f32_e64 on the SGPR input followed by a
; v_add_f32_e32 with 0. gfx90a expects two packed adds with 0 as second
; source, the first reading an SGPR pair and the second a VGPR pair.
; GCN-LABEL: {{^}}fadd_fadd_fsub:
; GFX900: v_add_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 0
; GFX900: v_add_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], s[{{[0-9:]+}}], 0 op_sel_hi:[1,0]
; GFX90A: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 op_sel_hi:[1,0]
define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg) {
bb:
  %i12 = fadd <2 x float> zeroinitializer, %arg
  %shift8 = shufflevector <2 x float> %i12, <2 x float> undef, <2 x i32> <i32 1, i32 undef>
  %i13 = fadd <2 x float> zeroinitializer, %shift8
  %i14 = shufflevector <2 x float> %arg, <2 x float> %i13, <2 x i32> <i32 0, i32 2>
  %i15 = fsub <2 x float> %i14, zeroinitializer
  store <2 x float> %i15, <2 x float>* undef
  ret void
}

; fadd of a loaded <4 x float> with a splat of its own element 0
; (shufflevector with a zeroinitializer mask).
; gfx900 expects four scalar adds. gfx90a expects two packed adds, each with
; op_sel_hi:[1,0].
; GCN-LABEL: {{^}}fadd_shuffle_v4:
; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX90A-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}] op_sel_hi:[1,0]
define amdgpu_kernel void @fadd_shuffle_v4(<4 x float> addrspace(1)* %arg) {
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %gep
  %shuf = shufflevector <4 x float> %in.1, <4 x float> undef, <4 x i32> zeroinitializer
  %add.1 = fadd <4 x float> %in.1, %shuf
  store <4 x float> %add.1, <4 x float> addrspace(1)* %gep
  ret void
}

; Negation of a loaded <2 x float>, written as fsub from <-0.0, -0.0>.
; gfx900 expects two v_xor_b32 with the sign-bit mask 0x80000000. gfx90a
; expects a packed add with 0 and neg_lo:[1,1] neg_hi:[1,1].
; GCN-LABEL: {{^}}fneg_v2f32_vec:
; GFX900-COUNT-2: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
; GFX90A:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0 neg_lo:[1,1] neg_hi:[1,1]{{$}}
define amdgpu_kernel void @fneg_v2f32_vec(<2 x float> addrspace(1)* %a) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %load
  store <2 x float> %fneg, <2 x float> addrspace(1)* %gep, align 8
  ret void
}

; Negation of the <2 x float> kernel argument, written as fsub from
; <-0.0, -0.0>. Both targets are expected to flip the sign bits with two
; s_xor_b32 against 0x80000000, with no packed instruction.
; GCN-LABEL: {{^}}fneg_v2f32_scalar:
; GCN-COUNT-2: s_xor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
define amdgpu_kernel void @fneg_v2f32_scalar(<2 x float> addrspace(1)* %a, <2 x float> %x) {
  %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x
  store <2 x float> %fneg, <2 x float> addrspace(1)* %a, align 8
  ret void
}

; Intrinsics used by the kernels in this file: the workitem id query and the
; fma overloads for 2-, 4- and 32-element float vectors.
declare i32 @llvm.amdgcn.workitem.id.x()
declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
