1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
2; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s
3
4; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
5
6; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
7; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
8; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
9
10; Check that we do not copy agprs to vgprs and back inside the loop.
11
12; GCN: [[LOOP:.LBB[0-9_]+]]:
13; GCN-NOT:  v_accvgpr
14; GFX908_A: v_mfma_f32_32x32x1f32
15; GCN-NOT:  v_accvgpr
16; GCN:      s_cbranch_scc1 [[LOOP]]
17
18; Final result should be read only once after the loop.
19
20; GFX908-COUNT-32: v_accvgpr_read_b32
21; GFX90A-NOT:      v_accvgpr_read_b32
22; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
23; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
24
; Baseline case: the MFMA accumulator starts as an all-zero <32 x float>.
; The loop body runs 16 times (%c counts 0..15) and the final MFMA result is
; stored to %arg once the loop exits.
25define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
26entry:
27  br label %for.cond.preheader
28
29for.cond.preheader:
  ; Loop-carried accumulator: zero on entry, the previous MFMA result afterwards.
30  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
31  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  ; The accumulator is fed straight back into the MFMA; the checks above this
  ; function expect no v_accvgpr copies around it inside the loop.
32  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
33  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
34  %cc = icmp eq i32 %inc, 16
35  br i1 %cc, label %exit, label %for.cond.preheader
36
37exit:
  ; Only the value produced by the last iteration is stored.
38  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
39  ret void
40}
41
42; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
43
44; Check that we do not use 32 temp vgprs. Unlike the unfoldable_seq test, which
45; rotates 3 vgprs, a splat needs only the single temp vgpr captured below.
46; Check that we do not use 32 temp sgprs as well.
47
48; GFX908_A:        v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
49; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
50; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
51; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
52
53; GCN: [[LOOP:.LBB[0-9_]+]]:
54; GCN-NOT:  v_accvgpr
55; GFX908_A: v_mfma_f32_32x32x1f32
56; GCN-NOT:  v_accvgpr
57; GCN:      s_cbranch_scc1 [[LOOP]]
58
59; GFX908-COUNT-32: v_accvgpr_read_b32
60; GFX90A-NOT:      v_accvgpr_read_b32
61; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
62; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
63
64define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) {
65entry:
66  br label %for.cond.preheader
67
68for.cond.preheader:
69  %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
70  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
71  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
72  %inc = add nuw nsw i32 %c, 1
73  %cc = icmp eq i32 %inc, 16
74  br i1 %cc, label %exit, label %for.cond.preheader
75
76exit:
77  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
78  ret void
79}
80
81; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
82
83; GCN:             v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}}
84; GCN:             v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
85; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
86; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}}
87
88; GCN: [[LOOP:.LBB[0-9_]+]]:
89; GCN-NOT:  v_accvgpr
90; GFX908_A: v_mfma_f32_32x32x1f32
91; GCN-NOT:  v_accvgpr
92; GCN:      s_cbranch_scc1 [[LOOP]]
93
94; GFX908-COUNT-32: v_accvgpr_read_b32
95; GFX90A-NOT:      v_accvgpr_read_b32
96; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
97; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
98
99define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) {
100entry:
101  br label %for.cond.preheader
102
103for.cond.preheader:
104  %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
105  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
106  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
107  %inc = add nuw nsw i32 %c, 1
108  %cc = icmp eq i32 %inc, 16
109  br i1 %cc, label %exit, label %for.cond.preheader
110
111exit:
112  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
113  ret void
114}
115
116; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
117
118; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
119; 3 vgprs are needed to avoid wait states between writes.
120
121; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
122; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
123; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
124; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
125; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
126; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
127; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
128; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
129; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
130; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
131; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
132; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
133; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
134; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
135; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
136; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
137; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
138; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
139; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
140; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
141; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
142; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
143; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
144; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
145; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
146; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
147; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
148; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
149; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
150; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
151; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
152; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
153; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
154; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
155; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
156; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
157; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
158; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
159; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
160; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
161; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
162; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
163; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
164; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
165; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
166; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
167; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
168; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
169; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
170; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
171; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
172; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
173; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
174; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
175; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
176; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
177; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
178; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
179; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
180; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
181; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
182; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
183; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
184; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
185
186; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
187; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
188; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
189; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
190; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
191; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
192; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
193; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
194; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
195; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
196; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
197; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
198; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
199; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
200; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
201; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
202; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
203; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
204; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
205; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
206; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
207; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
208; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
209; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
210; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
211; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
212; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
213; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
214; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
215; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
216; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
217; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
218; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
219; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
220; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
221; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
222; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
223; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
224; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
225; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
226; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
227; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
228; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
229; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
230; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
231; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
232; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
233; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
234; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
235; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
236; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
237; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
238; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
239; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
240; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
241; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
242; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
243; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
244; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
245; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
246; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
247; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
248; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
249; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
250
251
252; GCN: [[LOOP:.LBB[0-9_]+]]:
253; GCN-NOT:  v_accvgpr
254; GFX908_A: v_mfma_f32_32x32x1f32
255; GCN-NOT:  v_accvgpr
256; GCN:      s_cbranch_scc1 [[LOOP]]
257
258; GFX908-COUNT-32: v_accvgpr_read_b32
259; GFX90A-NOT:      v_accvgpr_read_b32
260; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
261; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
262
; Same 16-iteration MFMA loop, but every accumulator lane starts with a
; different constant: the sequence 123.0, 124.0, ..., 154.0. The checks above
; this function expect each constant to be moved into a vgpr before the
; corresponding accvgpr write.
263define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
264entry:
265  br label %for.cond.preheader
266
267for.cond.preheader:
  ; Loop-carried accumulator: the 32 distinct constants on entry, the previous
  ; MFMA result afterwards.
268  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
269  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
270  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
271  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
272  %cc = icmp eq i32 %inc, 16
273  br i1 %cc, label %exit, label %for.cond.preheader
274
275exit:
  ; Only the value produced by the last iteration is stored.
276  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
277  ret void
278}
279
280; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
281
282; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
283
284; GCN: [[LOOP:.LBB[0-9_]+]]:
285; GCN-NOT:  v_accvgpr
286; GFX908_A: v_mfma_f32_32x32x1f32
287; GCN-NOT:  v_accvgpr
288; GCN:      s_cbranch_scc1 [[LOOP]]
289
290; GFX908-COUNT-32: v_accvgpr_read_b32
291; GFX90A-NOT:      v_accvgpr_read_b32
292; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
293; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
294
; Same 16-iteration MFMA loop, but the accumulator is initialized from a
; per-workitem value: the workitem id x, reinterpreted as float and replicated
; into all 32 lanes. The checks above this function expect 32 accvgpr writes
; that all read v0.
295define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
296entry:
297  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bit-for-bit reinterpretation of the id, not an int-to-float conversion.
298  %init = bitcast i32 %tid to float
  ; Build the splat one lane at a time (indices 0..31); %tmp31 is the full vector.
299  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
300  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
301  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
302  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
303  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
304  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
305  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
306  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
307  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
308  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
309  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
310  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
311  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
312  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
313  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
314  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
315  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
316  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
317  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
318  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
319  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
320  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
321  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
322  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
323  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
324  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
325  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
326  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
327  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
328  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
329  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
330  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
331
332  br label %for.cond.preheader
333
334for.cond.preheader:
  ; Loop-carried accumulator: the splat on entry, the previous MFMA result afterwards.
335  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
336  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
337  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
338  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
339  %cc = icmp eq i32 %inc, 16
340  br i1 %cc, label %exit, label %for.cond.preheader
341
342exit:
  ; Only the value produced by the last iteration is stored.
343  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
344  ret void
345}
346
347; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
348
349; GFX908_A:        v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
350; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
351; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
352; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
353
354; GCN: [[LOOP:.LBB[0-9_]+]]:
355; GCN-NOT:  v_accvgpr
356; GFX908_A: v_mfma_f32_32x32x1f32
357; GCN-NOT:  v_accvgpr
358; GCN:      s_cbranch_scc1 [[LOOP]]
359
360; GFX908-COUNT-32: v_accvgpr_read_b32
361; GFX90A-NOT:      v_accvgpr_read_b32
362; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
363; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
364
; Same 16-iteration MFMA loop, but the accumulator is a splat of the kernel
; argument %init. The checks above this function expect the value to be moved
; from an sgpr into a single vgpr before it reaches the accvgpr writes.
365define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
366entry:
  ; Build the splat one lane at a time (indices 0..31); %tmp31 is the full vector.
367  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
368  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
369  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
370  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
371  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
372  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
373  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
374  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
375  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
376  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
377  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
378  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
379  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
380  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
381  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
382  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
383  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
384  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
385  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
386  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
387  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
388  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
389  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
390  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
391  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
392  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
393  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
394  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
395  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
396  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
397  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
398  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
399
400  br label %for.cond.preheader
401
402for.cond.preheader:
  ; Loop-carried accumulator: the splat on entry, the previous MFMA result afterwards.
403  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
404  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
405  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
406  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
407  %cc = icmp eq i32 %inc, 16
408  br i1 %cc, label %exit, label %for.cond.preheader
409
410exit:
  ; Only the value produced by the last iteration is stored.
411  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
412  ret void
413}
414
415; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
416
417; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v0
418; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
419; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
420; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
421; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
422; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
423; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
424; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
425; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
426; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
427; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
428; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
429; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
430; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
431; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
432; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
433; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
434; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
435; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
436; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
437; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
438; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
439; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
440; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
441; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
442; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
443; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
444; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
445; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
446; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
447; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
448; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
449; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
450
451; GFX90A-DAG:      v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
452; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
453
454; GCN: [[LOOP:.LBB[0-9_]+]]:
455; GCN-NOT:  v_accvgpr
456; GFX908_A: v_mfma_f32_32x32x1f32
457; GCN-NOT:  v_accvgpr
458; GCN:      s_cbranch_scc1 [[LOOP]]
459
460; GFX908-COUNT-32: v_accvgpr_read_b32
461; GFX90A-NOT:      v_accvgpr_read_b32
462; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
463; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
464
; Same 16-iteration MFMA loop with an accumulator assembled from three kinds
; of sources: lane 0 is the workitem id x reinterpreted as float, lane 1 is the
; kernel argument %x, and lanes 2..31 stay zero.
465define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
466entry:
467  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bit-for-bit reinterpretation of the id, not an int-to-float conversion.
468  %init = bitcast i32 %tid to float
  ; Start from all zeros so that only lanes 0 and 1 are non-constant.
469  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
470  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
471
472  br label %for.cond.preheader
473
474for.cond.preheader:
  ; Loop-carried accumulator: the mixed vector on entry, the previous MFMA
  ; result afterwards.
475  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
476  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
477  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
478  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
479  %cc = icmp eq i32 %inc, 16
480  br i1 %cc, label %exit, label %for.cond.preheader
481
482exit:
  ; Only the value produced by the last iteration is stored.
483  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
484  ret void
485}
486
487; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
488
489; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
490; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
491; GFX90A-NOT:      v_accvgpr
492; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
493; GFX90A-NOT:      v_accvgpr
494; GCN-NOT:         v_accvgpr
495
496; GCN: [[LOOP:.LBB[0-9_]+]]:
497; GCN-NOT:  v_accvgpr
498; GFX908_A: v_mfma_f32_32x32x1f32
499; GCN-NOT:  v_accvgpr
500; GCN:      s_cbranch_scc1 [[LOOP]]
501
502; GFX908-COUNT-32: v_accvgpr_read_b32
503; GFX90A-NOT:      v_accvgpr_read_b32
504; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
505; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
506
; Same 16-iteration MFMA loop, but the initial accumulator is itself the result
; of an MFMA executed before the loop (with an all-zero accumulator). The
; checks above this function expect that result to flow into the loop without
; any additional v_accvgpr instructions.
507define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
508entry:
  ; Pre-loop MFMA whose whole result vector seeds the loop accumulator.
509  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
510
511  br label %for.cond.preheader
512
513for.cond.preheader:
  ; Loop-carried accumulator: %mai.0 on entry, the previous in-loop MFMA
  ; result afterwards.
514  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
515  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
516  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
517  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
518  %cc = icmp eq i32 %inc, 16
519  br i1 %cc, label %exit, label %for.cond.preheader
520
521exit:
  ; Only the value produced by the last iteration is stored.
522  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
523  ret void
524}
525
526; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
527
528; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
529; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
530; GFX90A-NOT:      v_accvgpr
531; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
532; GFX90A-NOT:      v_accvgpr
533
534; Check that we are using only one tmp VGPR.
535
536; GCN:             v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
537; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
538; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
539; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
540
541; GCN: [[LOOP:.LBB[0-9_]+]]:
542; GCN-NOT:  v_accvgpr
543; GFX908_A: v_mfma_f32_32x32x1f32
544; GCN-NOT:  v_accvgpr
545; GCN:      s_cbranch_scc1 [[LOOP]]
546
547; GFX908-COUNT-32: v_accvgpr_read_b32
548; GFX90A-NOT:      v_accvgpr_read_b32
549; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
550; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
551
; Same 16-iteration MFMA loop, but the accumulator is a splat of a single
; element taken from an earlier MFMA result. The checks above this function
; expect one v_accvgpr_read_b32 into a temporary vgpr, which then feeds the
; accvgpr writes.
552define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
553entry:
554  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  ; Only lane 0 of the pre-loop MFMA result is used.
555  %init = extractelement <32 x float> %mai.0, i32 0
  ; Build the splat one lane at a time (indices 0..31); %tmp31 is the full vector.
556  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
557  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
558  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
559  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
560  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
561  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
562  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
563  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
564  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
565  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
566  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
567  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
568  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
569  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
570  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
571  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
572  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
573  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
574  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
575  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
576  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
577  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
578  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
579  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
580  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
581  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
582  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
583  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
584  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
585  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
586  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
587  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
588
589  br label %for.cond.preheader
590
591for.cond.preheader:
  ; Loop-carried accumulator: the splat on entry, the previous MFMA result afterwards.
592  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
593  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
594  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
595  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
596  %cc = icmp eq i32 %inc, 16
597  br i1 %cc, label %exit, label %for.cond.preheader
598
599exit:
  ; Only the value produced by the last iteration is stored.
600  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
601  ret void
602}
603
604; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
605
606; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
607; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
608; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
609
610; Check that we do not copy agprs to vgprs and back in an outer loop.
611
612; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
613; GCN-NOT:  v_accvgpr
614; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
615; GCN-NOT:  v_accvgpr
616; GFX908_A: v_mfma_f32_32x32x1f32
617; GCN-NOT:  v_accvgpr
618; GCN:      s_cbranch_scc1 [[INNER_LOOP]]
619; GCN-NOT:  v_accvgpr
620; GCN:      s_cbranch_scc1 [[OUTER_LOOP]]
621
622; Final result should be read only once after the loop.
623
624; GFX908-COUNT-32: v_accvgpr_read_b32
625; GFX90A-NOT:      v_accvgpr_read_b32
626; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
627; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
628
; Two nested loops of 16 iterations each around a single MFMA, so the MFMA
; executes 16 * 16 = 256 times. The zero-initialized accumulator is carried
; through both loop levels and stored once after the outer loop exits.
629define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
630entry:
631  br label %for.cond.preheader
632
633for.cond.preheader:
  ; Outer-loop header: the accumulator is zero on entry, otherwise the last
  ; MFMA result of the inner loop that just finished.
634  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
635  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
636  br label %inner.for.cond.preheader
637
638inner.for.cond.preheader:
  ; Inner-loop accumulator: seeded from the outer phi, then fed back from the MFMA.
639  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  ; The inner counter restarts at 0 on every outer iteration.
640  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
641  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
642  %inc = add nuw nsw i32 %c, 1
643  %cc = icmp eq i32 %inc, 16
644  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
645
646inner.exit:
  ; Outer-loop latch: advance the outer counter and repeat until it reaches 16.
647  %inc.0 = add nuw nsw i32 %c.0, 1
648  %cc.0 = icmp eq i32 %inc.0, 16
649  br i1 %cc.0, label %exit, label %for.cond.preheader
650
651exit:
  ; Only the value produced by the very last MFMA is stored.
652  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
653  ret void
654}
655
656declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
657declare i32 @llvm.amdgcn.workitem.id.x()
658