1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
2; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s
3; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940 %s
4
5; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
6
7; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
8; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
9; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
10
11; Check that we do not copy agprs to vgprs and back inside the loop.
12
13; GCN: [[LOOP:.LBB[0-9_]+]]:
14; GCN-NOT:  v_accvgpr
15; GFX908_A: v_mfma_f32_32x32x1f32
16; GFX940:   v_mfma_f32_32x32x1_2b_f32
17; GCN-NOT:  v_accvgpr
18; GCN:      s_cbranch_scc1 [[LOOP]]
19
20; Final result should be read only once after the loop.
21
22; GFX908-COUNT-32: v_accvgpr_read_b32
23; GFX90A-NOT:      v_accvgpr_read_b32
24; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
25; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
26
; Kernel: feeds the result of the 32x32x1 f32 MFMA intrinsic back into itself
; through a loop-carried <32 x float> accumulator that starts as
; zeroinitializer, then stores the final accumulator through %arg.
27define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
28entry:
29  br label %for.cond.preheader
30
31for.cond.preheader:
  ; %phi is the accumulator: all zeros on entry, the previous MFMA result on the back edge.
32  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  ; %c is the trip counter, starting at 0.
33  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  ; Scalar sources are the constants 1.0 and 2.0; the three trailing i32 operands are all 0.
34  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
35  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16, i.e. after 16 MFMA calls.
36  %cc = icmp eq i32 %inc, 16
37  br i1 %cc, label %exit, label %for.cond.preheader
38
39exit:
  ; The accumulator is only consumed here, after the loop.
40  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
41  ret void
42}
43
44; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
45
46; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
47; 3 vgprs are needed to avoid wait states between writes.
48; Check that we do not use 32 temp sgprs as well.
49
50; GFX908_A:        v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
51; GFX940:          s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000
52; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
53; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
54; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
55
56; GCN: [[LOOP:.LBB[0-9_]+]]:
57; GCN-NOT:  v_accvgpr
58; GFX908_A: v_mfma_f32_32x32x1f32
59; GFX940:   v_mfma_f32_32x32x1_2b_f32
60; GCN-NOT:  v_accvgpr
61; GCN:      s_cbranch_scc1 [[LOOP]]
62
63; GFX908-COUNT-32: v_accvgpr_read_b32
64; GFX90A-NOT:      v_accvgpr_read_b32
65; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
66; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
67
68define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) #0 {
69entry:
70  br label %for.cond.preheader
71
72for.cond.preheader:
73  %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
74  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
75  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
76  %inc = add nuw nsw i32 %c, 1
77  %cc = icmp eq i32 %inc, 16
78  br i1 %cc, label %exit, label %for.cond.preheader
79
80exit:
81  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
82  ret void
83}
84
85; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
86
87; GCN:             v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}}
88; GCN:             v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
89; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
90; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}}
91
92; GCN: [[LOOP:.LBB[0-9_]+]]:
93; GCN-NOT:  v_accvgpr
94; GFX908_A: v_mfma_f32_32x32x1f32
95; GFX940:   v_mfma_f32_32x32x1_2b_f32
96; GCN-NOT:  v_accvgpr
97; GCN:      s_cbranch_scc1 [[LOOP]]
98
99; GFX908-COUNT-32: v_accvgpr_read_b32
100; GFX90A-NOT:      v_accvgpr_read_b32
101; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
102; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
103
104define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) #0 {
105entry:
106  br label %for.cond.preheader
107
108for.cond.preheader:
109  %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
110  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
111  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
112  %inc = add nuw nsw i32 %c, 1
113  %cc = icmp eq i32 %inc, 16
114  br i1 %cc, label %exit, label %for.cond.preheader
115
116exit:
117  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
118  ret void
119}
120
121; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
122
123; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
124; 3 vgprs are needed to avoid wait states between writes.
125
126; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
127; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
128; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
129; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
130; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
131; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
132; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
133; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
134; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
135; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
136; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
137; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
138; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
139; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
140; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
141; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
142; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
143; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
144; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
145; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
146; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
147; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
148; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
149; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
150; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
151; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
152; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
153; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
154; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
155; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
156; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
157; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
158; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
159; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
160; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
161; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
162; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
163; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
164; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
165; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
166; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
167; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
168; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
169; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
170; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
171; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
172; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
173; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
174; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
175; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
176; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
177; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
178; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
179; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
180; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
181; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
182; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
183; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
184; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
185; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
186; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
187; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
188; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
189; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
190
191; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
192; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
193; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
194; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
195; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
196; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
197; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
198; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
199; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
200; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
201; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
202; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
203; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
204; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
205; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
206; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
207; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
208; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
209; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
210; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
211; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
212; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
213; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
214; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
215; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
216; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
217; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
218; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
219; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
220; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
221; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
222; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
223; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
224; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
225; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
226; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
227; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
228; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
229; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
230; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
231; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
232; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
233; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
234; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
235; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
236; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
237; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
238; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
239; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
240; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
241; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
242; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
243; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
244; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
245; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
246; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
247; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
248; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
249; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
250; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
251; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
252; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
253; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
254; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
255
256; GFX940-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}}
257; GFX940-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
258
259; GCN: [[LOOP:.LBB[0-9_]+]]:
260; GCN-NOT:  v_accvgpr
261; GFX908_A: v_mfma_f32_32x32x1f32
262; GFX940:   v_mfma_f32_32x32x1_2b_f32
263; GCN-NOT:  v_accvgpr
264; GCN:      s_cbranch_scc1 [[LOOP]]
265
266; GFX908-COUNT-32: v_accvgpr_read_b32
267; GFX90A-NOT:      v_accvgpr_read_b32
268; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
269; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
270
; Kernel: same 16-trip MFMA accumulation loop, seeded with 32 distinct
; constants: float 123.0, 124.0, ... up to 154.0, one per element in
; ascending order.
; NOTE(review): "unfoldable" in the name presumably means none of these
; constants can be used directly as an operand of the accumulator write --
; confirm.
271define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) #0 {
272entry:
273  br label %for.cond.preheader
274
275for.cond.preheader:
  ; Accumulator: the 123.0..154.0 sequence on entry, the previous MFMA result on the back edge.
276  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
277  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
278  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
279  %inc = add nuw nsw i32 %c, 1
  ; Exit after the 16th MFMA call.
280  %cc = icmp eq i32 %inc, 16
281  br i1 %cc, label %exit, label %for.cond.preheader
282
283exit:
  ; Single store of the final accumulator after the loop.
284  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
285  ret void
286}
287
288; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
289
290; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
291
292; GCN: [[LOOP:.LBB[0-9_]+]]:
293; GCN-NOT:  v_accvgpr
294; GFX908_A: v_mfma_f32_32x32x1f32
295; GFX940:   v_mfma_f32_32x32x1_2b_f32
296; GCN-NOT:  v_accvgpr
297; GCN:      s_cbranch_scc1 [[LOOP]]
298
299; GFX908-COUNT-32: v_accvgpr_read_b32
300; GFX90A-NOT:      v_accvgpr_read_b32
301; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
302; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
303
; Kernel: same 16-trip MFMA accumulation loop, seeded with a runtime value:
; the workitem id (x), reinterpreted as a float and copied into all 32
; elements of the initial accumulator.
304define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) #0 {
305entry:
306  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bit-for-bit reinterpretation of the id; no int-to-float conversion happens here.
307  %init = bitcast i32 %tid to float
  ; Build the splat one element at a time, indices 0 through 31, starting from undef.
308  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
309  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
310  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
311  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
312  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
313  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
314  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
315  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
316  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
317  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
318  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
319  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
320  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
321  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
322  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
323  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
324  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
325  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
326  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
327  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
328  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
329  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
330  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
331  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
332  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
333  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
334  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
335  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
336  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
337  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
338  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
339  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
340
341  br label %for.cond.preheader
342
343for.cond.preheader:
  ; Accumulator: the fully populated splat %tmp31 on entry, the previous MFMA result on the back edge.
344  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
345  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
346  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
347  %inc = add nuw nsw i32 %c, 1
  ; Exit after the 16th MFMA call.
348  %cc = icmp eq i32 %inc, 16
349  br i1 %cc, label %exit, label %for.cond.preheader
350
351exit:
  ; Single store of the final accumulator after the loop.
352  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
353  ret void
354}
355
356; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
357
358; GFX908_A:        v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
359; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
360; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
361; GFX940:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
362; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
363
364; GCN: [[LOOP:.LBB[0-9_]+]]:
365; GCN-NOT:  v_accvgpr
366; GFX908_A: v_mfma_f32_32x32x1f32
367; GFX940:   v_mfma_f32_32x32x1_2b_f32
368; GCN-NOT:  v_accvgpr
369; GCN:      s_cbranch_scc1 [[LOOP]]
370
371; GFX908-COUNT-32: v_accvgpr_read_b32
372; GFX90A-NOT:      v_accvgpr_read_b32
373; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
374; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
375
; Kernel: same 16-trip MFMA accumulation loop, seeded with the float kernel
; argument %init copied into all 32 elements of the initial accumulator.
376define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) #0 {
377entry:
  ; Build the splat one element at a time, indices 0 through 31, starting from undef.
378  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
379  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
380  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
381  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
382  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
383  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
384  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
385  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
386  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
387  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
388  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
389  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
390  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
391  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
392  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
393  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
394  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
395  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
396  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
397  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
398  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
399  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
400  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
401  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
402  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
403  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
404  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
405  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
406  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
407  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
408  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
409  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
410
411  br label %for.cond.preheader
412
413for.cond.preheader:
  ; Accumulator: the fully populated splat %tmp31 on entry, the previous MFMA result on the back edge.
414  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
415  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
416  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
417  %inc = add nuw nsw i32 %c, 1
  ; Exit after the 16th MFMA call.
418  %cc = icmp eq i32 %inc, 16
419  br i1 %cc, label %exit, label %for.cond.preheader
420
421exit:
  ; Single store of the final accumulator after the loop.
422  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
423  ret void
424}
425
426; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
427
428; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v0
429; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
430; GFX940-DAG:   s_load_dword [[TMP:s[0-9]+]],
431; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
432; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
433; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
434; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
435; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
436; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
437; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
438; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
439; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
440; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
441; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
442; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
443; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
444; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
445; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
446; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
447; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
448; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
449; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
450; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
451; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
452; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
453; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
454; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
455; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
456; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
457; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
458; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
459; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
460; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
461; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
462
463; GFX90A-DAG:      v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
464; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
465
466; GCN: [[LOOP:.LBB[0-9_]+]]:
467; GCN-NOT:  v_accvgpr
468; GFX908_A: v_mfma_f32_32x32x1f32
469; GFX940:   v_mfma_f32_32x32x1_2b_f32
470; GCN-NOT:  v_accvgpr
471; GCN:      s_cbranch_scc1 [[LOOP]]
472
473; GFX908-COUNT-32: v_accvgpr_read_b32
474; GFX90A-NOT:      v_accvgpr_read_b32
475; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
476; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
477
; Kernel: same 16-trip MFMA accumulation loop, seeded with a vector that mixes
; three kinds of values: element 0 is the workitem id (x) reinterpreted as a
; float, element 1 is the float kernel argument %x, and the remaining 30
; elements are zero.
478define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) #0 {
479entry:
480  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; Bit-for-bit reinterpretation of the id; no int-to-float conversion happens here.
481  %init = bitcast i32 %tid to float
  ; Start from all zeros and overwrite only elements 0 and 1.
482  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
483  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
484
485  br label %for.cond.preheader
486
487for.cond.preheader:
  ; Accumulator: the mixed vector %tmp1 on entry, the previous MFMA result on the back edge.
488  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
489  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
490  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
491  %inc = add nuw nsw i32 %c, 1
  ; Exit after the 16th MFMA call.
492  %cc = icmp eq i32 %inc, 16
493  br i1 %cc, label %exit, label %for.cond.preheader
494
495exit:
  ; Single store of the final accumulator after the loop.
496  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
497  ret void
498}
499
500; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
501
502; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
503; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
504; GFX90A-NOT:      v_accvgpr
505; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
506; GFX90A-NOT:      v_accvgpr
507; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
508; GCN-NOT:         v_accvgpr
509
510; GCN: [[LOOP:.LBB[0-9_]+]]:
511; GCN-NOT:  v_accvgpr
512; GFX908_A: v_mfma_f32_32x32x1f32
513; GFX940:   v_mfma_f32_32x32x1_2b_f32
514; GCN-NOT:  v_accvgpr
515; GCN:      s_cbranch_scc1 [[LOOP]]
516
517; GFX908-COUNT-32: v_accvgpr_read_b32
518; GFX90A-NOT:      v_accvgpr_read_b32
519; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
520; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
521
; Kernel: same 16-trip MFMA accumulation loop, but the initial accumulator is
; itself the result of an MFMA call issued in the entry block (with a
; zeroinitializer accumulator) and forwarded unchanged into the loop.
522define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) #0 {
523entry:
  ; Pre-loop MFMA whose whole result vector seeds the loop accumulator.
524  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
525
526  br label %for.cond.preheader
527
528for.cond.preheader:
  ; Accumulator: %mai.0 on entry, the previous in-loop MFMA result on the back edge.
529  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
530  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
531  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
532  %inc = add nuw nsw i32 %c, 1
  ; Exit after the 16th in-loop MFMA call.
533  %cc = icmp eq i32 %inc, 16
534  br i1 %cc, label %exit, label %for.cond.preheader
535
536exit:
  ; Single store of the final accumulator after the loop.
537  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
538  ret void
539}
540
541; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
542
543; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
544; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
545; GFX90A-NOT:      v_accvgpr
546; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
547; GFX90A-NOT:      v_accvgpr
548; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
549
550; Check that we are using only one tmp VGPR.
551
552; GCN:             v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
553; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
554; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
555; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
556
557; GCN: [[LOOP:.LBB[0-9_]+]]:
558; GCN-NOT:  v_accvgpr
559; GFX908_A: v_mfma_f32_32x32x1f32
560; GFX940:   v_mfma_f32_32x32x1_2b_f32
561; GCN-NOT:  v_accvgpr
562; GCN:      s_cbranch_scc1 [[LOOP]]
563
564; GFX908-COUNT-32: v_accvgpr_read_b32
565; GFX90A-NOT:      v_accvgpr_read_b32
566; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
567; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
568
; Kernel: same 16-trip MFMA accumulation loop, seeded with a splat of a single
; scalar taken from a pre-loop MFMA result: element 0 of %mai.0 is extracted
; and copied into all 32 elements of the initial accumulator.
569define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) #0 {
570entry:
571  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  ; Only element 0 of the pre-loop MFMA result is used.
572  %init = extractelement <32 x float> %mai.0, i32 0
  ; Build the splat one element at a time, indices 0 through 31, starting from undef.
573  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
574  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
575  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
576  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
577  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
578  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
579  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
580  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
581  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
582  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
583  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
584  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
585  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
586  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
587  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
588  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
589  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
590  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
591  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
592  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
593  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
594  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
595  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
596  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
597  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
598  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
599  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
600  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
601  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
602  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
603  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
604  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
605
606  br label %for.cond.preheader
607
608for.cond.preheader:
  ; Accumulator: the fully populated splat %tmp31 on entry, the previous in-loop MFMA result on the back edge.
609  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
610  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
611  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
612  %inc = add nuw nsw i32 %c, 1
  ; Exit after the 16th in-loop MFMA call.
613  %cc = icmp eq i32 %inc, 16
614  br i1 %cc, label %exit, label %for.cond.preheader
615
616exit:
  ; Single store of the final accumulator after the loop.
617  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
618  ret void
619}
620
621; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
622
623; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
624; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
625; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
626
627; Check that we do not copy agprs to vgprs and back in an outer loop.
628
629; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
630; GCN-NOT:  v_accvgpr
631; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
632; GCN-NOT:  v_accvgpr
633; GFX908_A: v_mfma_f32_32x32x1f32
634; GFX940:   v_mfma_f32_32x32x1_2b_f32
635; GCN-NOT:  v_accvgpr
636; GCN:      s_cbranch_scc1 [[INNER_LOOP]]
637; GCN-NOT:  v_accvgpr
638; GCN:      s_cbranch_scc1 [[OUTER_LOOP]]
639
640; Final result should be read only once after the loop.
641
642; GFX908-COUNT-32: v_accvgpr_read_b32
643; GFX90A-NOT:      v_accvgpr_read_b32
644; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
645; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
646
; Kernel: two nested loops around the MFMA call. The <32 x float> accumulator
; starts as zeroinitializer and is carried through both the inner loop and the
; outer loop; it is stored through %arg only after the outer loop finishes.
647define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
648entry:
649  br label %for.cond.preheader
650
651for.cond.preheader:
  ; Outer-loop header. %phi.0 is the accumulator as seen by the outer loop:
  ; zeros on entry, otherwise the value the inner loop finished with.
652  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  ; %c.0 is the outer trip counter, starting at 0.
653  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
654  br label %inner.for.cond.preheader
655
656inner.for.cond.preheader:
  ; Inner-loop body. %phi takes the outer accumulator on first entry and the
  ; previous MFMA result on the inner back edge.
657  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  ; %c is the inner trip counter; it restarts at 0 every outer iteration.
658  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
659  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
660  %inc = add nuw nsw i32 %c, 1
  ; The inner loop ends after its 16th MFMA call.
661  %cc = icmp eq i32 %inc, 16
662  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
663
664inner.exit:
  ; Outer-loop latch: bump the outer counter and repeat until it reaches 16.
665  %inc.0 = add nuw nsw i32 %c.0, 1
666  %cc.0 = icmp eq i32 %inc.0, 16
667  br i1 %cc.0, label %exit, label %for.cond.preheader
668
669exit:
  ; Single store of the final accumulator after both loops.
670  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
671  ret void
672}
673
; MFMA intrinsic called by the kernels in this file: two float sources, a
; <32 x float> accumulator, and three i32 operands (always passed as 0 here).
674declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
; Workitem id (x) query; used by the kernels that seed the accumulator from a
; per-workitem runtime value.
675declare i32 @llvm.amdgcn.workitem.id.x()
676
; Attribute group referenced as #0 on each kernel definition in this file.
677attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
678