; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s

; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back inside the loop.

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: a single loop whose <32 x float> accumulator starts as
; zeroinitializer and is replaced by the MFMA result on every back edge. The
; loop body runs 16 times (%inc counts 1..16) and the final accumulator is
; stored to %arg once, in %exit.
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: all zeros on entry, previous MFMA result afterwards.
  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

44; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
45
46; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
47; 3 vgprs are needed to avoid wait states between writes.
48; Check that we do not use 32 temp sgprs as well.
49
50; GFX908:          v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
51; GFX940_A:        s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000
52; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
53; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
54; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
55
56; GCN: [[LOOP:.LBB[0-9_]+]]:
57; GCN-NOT:  v_accvgpr
58; GFX908_A: v_mfma_f32_32x32x1f32
59; GFX940:   v_mfma_f32_32x32x1_2b_f32
60; GCN-NOT:  v_accvgpr
61; GCN:      s_cbranch_scc1 [[LOOP]]
62
63; GFX908-COUNT-32: v_accvgpr_read_b32
64; GFX90A-NOT:      v_accvgpr_read_b32
65; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
66; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
67
68define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) #0 {
69entry:
70  br label %for.cond.preheader
71
72for.cond.preheader:
73  %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
74  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
75  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
76  %inc = add nuw nsw i32 %c, 1
77  %cc = icmp eq i32 %inc, 16
78  br i1 %cc, label %exit, label %for.cond.preheader
79
80exit:
81  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
82  ret void
83}
84
85; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
86
87; GCN:             v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}}
88; GCN:             v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
89; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
90; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}}
91
92; GCN: [[LOOP:.LBB[0-9_]+]]:
93; GCN-NOT:  v_accvgpr
94; GFX908_A: v_mfma_f32_32x32x1f32
95; GFX940:   v_mfma_f32_32x32x1_2b_f32
96; GCN-NOT:  v_accvgpr
97; GCN:      s_cbranch_scc1 [[LOOP]]
98
99; GFX908-COUNT-32: v_accvgpr_read_b32
100; GFX90A-NOT:      v_accvgpr_read_b32
101; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
102; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
103
104define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) #0 {
105entry:
106  br label %for.cond.preheader
107
108for.cond.preheader:
109  %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
110  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
111  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
112  %inc = add nuw nsw i32 %c, 1
113  %cc = icmp eq i32 %inc, 16
114  br i1 %cc, label %exit, label %for.cond.preheader
115
116exit:
117  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
118  ret void
119}
120
; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:

; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.

; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GFX940_A-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}}
; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: same 16-iteration MFMA loop, with an initial accumulator
; made of 32 distinct constants, 123.0 through 154.0 in steps of 1.0, so no
; two elements share a value.
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) #0 {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: the 123.0..154.0 sequence on entry, previous MFMA result afterwards.
  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: same 16-iteration MFMA loop, with an initial accumulator
; whose 32 elements all hold the same runtime value: the workitem id (x)
; reinterpreted as a float. The checks above expect that value to come from v0.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  ; Splat %init into every element, one insertelement per lane.
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: the splat on entry, previous MFMA result afterwards.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:

; GFX908:          v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX940_A:        v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: same 16-iteration MFMA loop, with an initial accumulator
; whose 32 elements all hold the kernel argument %init. The checks above expect
; that value to be sourced from a scalar register.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) #0 {
entry:
  ; Splat %init into every element, one insertelement per lane.
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: the splat on entry, previous MFMA result afterwards.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:

; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, v0
; GFX908-DAG:   v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX940_A-DAG: s_load_dword [[TMP:s[0-9]+]],
; GCN-DAG:      v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG:   v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; GFX90A-DAG:      v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: same 16-iteration MFMA loop, with an initial accumulator
; built from three kinds of source: element 0 is the workitem id (x)
; reinterpreted as a float, element 1 is the kernel argument %x, and the
; remaining 30 elements are zero.
define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  ; Start from all zeros, then overwrite elements 0 and 1 only.
  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1

  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: the mixed vector on entry, previous MFMA result afterwards.
  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT:      v_accvgpr
; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT:      v_accvgpr
; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GCN-NOT:         v_accvgpr

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: same 16-iteration MFMA loop, with an initial accumulator
; that is itself the result of an MFMA (with a zero accumulator) executed in
; %entry, so the pre-loop result is forwarded straight into the loop.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) #0 {
entry:
  ; Pre-loop MFMA whose result seeds the loop-carried accumulator.
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: %mai.0 on entry, previous in-loop MFMA result afterwards.
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT:      v_accvgpr
; GFX90A:          v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT:      v_accvgpr
; GFX940:          v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}

; Check that we are using only one tmp VGPR.

; GCN:             v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: same 16-iteration MFMA loop, with an initial accumulator
; that is a splat of element 0 of a pre-loop MFMA result, so the splat source
; is itself produced by an MFMA.
define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) #0 {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  ; Take element 0 of the MFMA result and splat it into every element.
  %init = extractelement <32 x float> %mai.0, i32 0
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; Loop-carried accumulator: the splat on entry, previous in-loop MFMA result afterwards.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A:          v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back in an outer loop.

; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT:  v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940:   v_mfma_f32_32x32x1_2b_f32
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT:  v_accvgpr
; GCN:      s_cbranch_scc1 [[OUTER_LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT:      v_accvgpr_read_b32
; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Kernel under test: two nested loops of 16 iterations each. The accumulator
; starts as zeroinitializer, is carried through the inner MFMA loop, and the
; inner loop's final value is carried around the outer loop as well. The result
; is stored to %arg once, in %exit.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; Outer-loop accumulator: zeros on entry, the inner loop's last MFMA result afterwards.
  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
  br label %inner.for.cond.preheader

inner.for.cond.preheader:
  ; Inner-loop accumulator: seeded from the outer phi, then fed back from the MFMA.
  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

inner.exit:
  %inc.0 = add nuw nsw i32 %c.0, 1
  %cc.0 = icmp eq i32 %inc.0, 16
  br i1 %cc.0, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; Intrinsics called by the kernels in this file: the MFMA operation under test
; and the workitem-id query used to obtain a per-lane runtime value.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; Attribute group referenced as #0 by the kernel definitions.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
