1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
2
3; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
4
5; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
6
7; Check that we do not copy agprs to vgprs and back inside the loop.
8
9; GCN: [[LOOP:BB[0-9_]+]]:
10; GCN-NOT: v_accvgpr
11; GCN: v_mfma_f32_32x32x1f32
12; GCN-NOT: v_accvgpr
13; GCN: s_cbranch_scc1 [[LOOP]]
14
15; Final result should be read only once after the loop.
16
17; GCN-COUNT32: v_accvgpr_read_b32
18
19define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
20entry:
21  br label %for.cond.preheader
22
23for.cond.preheader:
24  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
25  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
26  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
27  %inc = add nuw nsw i32 %c, 1
28  %cc = icmp eq i32 %inc, 16
29  br i1 %cc, label %exit, label %for.cond.preheader
30
31exit:
32  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
33  ret void
34}
35
36; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
37
38; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
39; 3 vgprs are needed to avoid wait states between writes.
40; Check that we do not use 32 temp sgprs as well.
41
42; GCN:         v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
43; GCN-COUNT32: v_accvgpr_write_b32 a0, [[TMP]]
44
45; GCN: [[LOOP:BB[0-9_]+]]:
46; GCN-NOT: v_accvgpr
47; GCN: v_mfma_f32_32x32x1f32
48; GCN-NOT: v_accvgpr
49; GCN: s_cbranch_scc1 [[LOOP]]
50
51; GCN-COUNT32: v_accvgpr_read_b32
52
53define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) {
54entry:
55  br label %for.cond.preheader
56
57for.cond.preheader:
58  %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
59  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
60  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
61  %inc = add nuw nsw i32 %c, 1
62  %cc = icmp eq i32 %inc, 16
63  br i1 %cc, label %exit, label %for.cond.preheader
64
65exit:
66  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
67  ret void
68}
69
70; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
71
72; GCN:         v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
73; GCN:         v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
74; GCN-COUNT30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
75
76; GCN: [[LOOP:BB[0-9_]+]]:
77; GCN-NOT: v_accvgpr
78; GCN: v_mfma_f32_32x32x1f32
79; GCN-NOT: v_accvgpr
80; GCN: s_cbranch_scc1 [[LOOP]]
81
82; GCN-COUNT32: v_accvgpr_read_b32
83
84define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) {
85entry:
86  br label %for.cond.preheader
87
88for.cond.preheader:
89  %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
90  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
91  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
92  %inc = add nuw nsw i32 %c, 1
93  %cc = icmp eq i32 %inc, 16
94  br i1 %cc, label %exit, label %for.cond.preheader
95
96exit:
97  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
98  ret void
99}
100
101; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
102
103; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
104; 3 vgprs are needed to avoid wait states between writes.
105
106; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
107; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
108; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
109; GCN-COUNT29: v_mov_b32_e32 v1, 0x4{{[0-9a-f]+}}
110; GCN-COUNT10: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
111; GCN-COUNT11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
112; GCN-COUNT11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
113
114; GCN: [[LOOP:BB[0-9_]+]]:
115; GCN-NOT: v_accvgpr
116; GCN: v_mfma_f32_32x32x1f32
117; GCN-NOT: v_accvgpr
118; GCN: s_cbranch_scc1 [[LOOP]]
119
120; GCN-COUNT32: v_accvgpr_read_b32
121
122define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
123entry:
124  br label %for.cond.preheader
125
126for.cond.preheader:
127  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
128  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
129  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
130  %inc = add nuw nsw i32 %c, 1
131  %cc = icmp eq i32 %inc, 16
132  br i1 %cc, label %exit, label %for.cond.preheader
133
134exit:
135  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
136  ret void
137}
138
139; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
140
141; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
142
143; GCN: [[LOOP:BB[0-9_]+]]:
144; GCN-NOT: v_accvgpr
145; GCN: v_mfma_f32_32x32x1f32
146; GCN-NOT: v_accvgpr
147; GCN: s_cbranch_scc1 [[LOOP]]
148
149; GCN-COUNT32: v_accvgpr_read_b32
150
151define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
152entry:
153  %tid = call i32 @llvm.amdgcn.workitem.id.x()
154  %init = bitcast i32 %tid to float
155  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
156  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
157  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
158  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
159  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
160  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
161  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
162  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
163  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
164  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
165  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
166  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
167  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
168  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
169  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
170  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
171  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
172  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
173  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
174  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
175  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
176  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
177  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
178  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
179  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
180  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
181  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
182  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
183  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
184  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
185  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
186  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
187
188  br label %for.cond.preheader
189
190for.cond.preheader:
191  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
192  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
193  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
194  %inc = add nuw nsw i32 %c, 1
195  %cc = icmp eq i32 %inc, 16
196  br i1 %cc, label %exit, label %for.cond.preheader
197
198exit:
199  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
200  ret void
201}
202
203; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
204
205; GCN:         v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
206; GCN-COUNT32: v_accvgpr_write_b32 a0, [[TMP]]
207
208; GCN: [[LOOP:BB[0-9_]+]]:
209; GCN-NOT: v_accvgpr
210; GCN: v_mfma_f32_32x32x1f32
211; GCN-NOT: v_accvgpr
212; GCN: s_cbranch_scc1 [[LOOP]]
213
214; GCN-COUNT32: v_accvgpr_read_b32
215
216define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
217entry:
218  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
219  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
220  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
221  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
222  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
223  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
224  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
225  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
226  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
227  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
228  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
229  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
230  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
231  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
232  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
233  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
234  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
235  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
236  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
237  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
238  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
239  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
240  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
241  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
242  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
243  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
244  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
245  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
246  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
247  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
248  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
249  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
250
251  br label %for.cond.preheader
252
253for.cond.preheader:
254  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
255  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
256  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
257  %inc = add nuw nsw i32 %c, 1
258  %cc = icmp eq i32 %inc, 16
259  br i1 %cc, label %exit, label %for.cond.preheader
260
261exit:
262  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
263  ret void
264}
265
266; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
267
268; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v0
269; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
270; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
271
272; GCN-COUNT30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
273
274; GCN: [[LOOP:BB[0-9_]+]]:
275; GCN-NOT: v_accvgpr
276; GCN: v_mfma_f32_32x32x1f32
277; GCN-NOT: v_accvgpr
278; GCN: s_cbranch_scc1 [[LOOP]]
279
280; GCN-COUNT32: v_accvgpr_read_b32
281
282define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
283entry:
284  %tid = call i32 @llvm.amdgcn.workitem.id.x()
285  %init = bitcast i32 %tid to float
286  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
287  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
288
289  br label %for.cond.preheader
290
291for.cond.preheader:
292  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
293  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
294  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
295  %inc = add nuw nsw i32 %c, 1
296  %cc = icmp eq i32 %inc, 16
297  br i1 %cc, label %exit, label %for.cond.preheader
298
299exit:
300  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
301  ret void
302}
303
304; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
305
306; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
307; GCN: v_mfma_f32_32x32x1f32
308; GCN-NOT: v_accvgpr
309
310; GCN: [[LOOP:BB[0-9_]+]]:
311; GCN-NOT: v_accvgpr
312; GCN: v_mfma_f32_32x32x1f32
313; GCN-NOT: v_accvgpr
314; GCN: s_cbranch_scc1 [[LOOP]]
315
316; GCN-COUNT32: v_accvgpr_read_b32
317define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
318entry:
319  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
320
321  br label %for.cond.preheader
322
323for.cond.preheader:
324  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
325  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
326  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
327  %inc = add nuw nsw i32 %c, 1
328  %cc = icmp eq i32 %inc, 16
329  br i1 %cc, label %exit, label %for.cond.preheader
330
331exit:
332  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
333  ret void
334}
335
336; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
337
338; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
339; GCN: v_mfma_f32_32x32x1f32
340
341; Check that we are using only one tmp VGPR.
342
343; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
344; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
345
346; GCN: [[LOOP:BB[0-9_]+]]:
347; GCN-NOT: v_accvgpr
348; GCN: v_mfma_f32_32x32x1f32
349; GCN-NOT: v_accvgpr
350; GCN: s_cbranch_scc1 [[LOOP]]
351
352; GCN-COUNT32: v_accvgpr_read_b32
353define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
354entry:
355  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
356  %init = extractelement <32 x float> %mai.0, i32 0
357  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
358  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
359  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
360  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
361  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
362  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
363  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
364  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
365  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
366  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
367  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
368  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
369  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
370  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
371  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
372  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
373  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
374  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
375  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
376  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
377  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
378  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
379  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
380  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
381  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
382  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
383  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
384  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
385  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
386  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
387  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
388  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
389
390  br label %for.cond.preheader
391
392for.cond.preheader:
393  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
394  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
395  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
396  %inc = add nuw nsw i32 %c, 1
397  %cc = icmp eq i32 %inc, 16
398  br i1 %cc, label %exit, label %for.cond.preheader
399
400exit:
401  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
402  ret void
403}
404
405declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
406declare i32 @llvm.amdgcn.workitem.id.x()
407