; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX940_A %s
; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940,GFX940_A %s

; Loop-carried MFMA accumulator tests. The three run lines above compile this
; file for gfx908, gfx90a and gfx940 and select the check prefixes that apply
; to each target.

; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back inside the loop.

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator phi starts from zeroinitializer and is fed back through the
; MFMA intrinsic until the counter reaches 16; the last MFMA result is stored
; to %arg after the loop.
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:

; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
47; 3 vgprs are needed to avoid wait states between writes. 48; Check that we do not use 32 temp sgprs as well. 49 50; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 51; GFX940_A: s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000 52; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 53; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] 54; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 55 56; GCN: [[LOOP:.LBB[0-9_]+]]: 57; GCN-NOT: v_accvgpr 58; GFX908_A: v_mfma_f32_32x32x1f32 59; GFX940: v_mfma_f32_32x32x1_2b_f32 60; GCN-NOT: v_accvgpr 61; GCN: s_cbranch_scc1 [[LOOP]] 62 63; GFX908-COUNT-32: v_accvgpr_read_b32 64; GFX90A-NOT: v_accvgpr_read_b32 65; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 66; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 67 68define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) #0 { 69entry: 70 br label %for.cond.preheader 71 72for.cond.preheader: 73 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ] 74 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 75 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 76 %inc = add nuw nsw i32 %c, 1 77 %cc = icmp eq i32 %inc, 16 78 br i1 %cc, label %exit, label %for.cond.preheader 79 80exit: 81 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 82 ret void 83} 84 85; GCN-LABEL: {{^}}test_mfma_loop_non_splat: 86 87; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}} 88; GCN: v_accvgpr_write_b32 
a{{[0-9]+}}, 1.0{{$}} 89; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 90; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}} 91 92; GCN: [[LOOP:.LBB[0-9_]+]]: 93; GCN-NOT: v_accvgpr 94; GFX908_A: v_mfma_f32_32x32x1f32 95; GFX940: v_mfma_f32_32x32x1_2b_f32 96; GCN-NOT: v_accvgpr 97; GCN: s_cbranch_scc1 [[LOOP]] 98 99; GFX908-COUNT-32: v_accvgpr_read_b32 100; GFX90A-NOT: v_accvgpr_read_b32 101; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 102; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 103 104define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) #0 { 105entry: 106 br label %for.cond.preheader 107 108for.cond.preheader: 109 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ] 110 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 111 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 112 %inc = add nuw nsw i32 %c, 1 113 %cc = icmp eq i32 %inc, 16 114 br i1 %cc, label %exit, label %for.cond.preheader 115 116exit: 117 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 118 ret void 119} 120 121; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: 122 123; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 124; 3 vgprs are needed to avoid wait states between writes. 
; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; GFX940_A-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}}
; GFX940_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The initial accumulator holds 32 distinct constants, 123.0 through 154.0.
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) #0 {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Every element of the initial accumulator is the workitem id x bitcast to
; float, inserted one element at a time.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:

; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX940_A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Every element of the initial accumulator is the float kernel argument %init,
; inserted one element at a time.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) #0 {
entry:
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:

; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
; GFX908-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX940_A-DAG: s_load_dword [[TMP:s[0-9]+]],
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Element 0 of the initial accumulator comes from the workitem id, element 1
; from the kernel argument %x, and the remaining 30 elements stay zero.
define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) #0 {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GCN-NOT: v_accvgpr

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The initial accumulator is itself the result of an MFMA call on
; zeroinitializer in the entry block.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) #0 {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}

; Check that we are using only one tmp VGPR.
; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Element 0 of an entry-block MFMA result is extracted and inserted into all
; 32 elements to form the initial accumulator.
define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) #0 {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  %init = extractelement <32 x float> %mai.0, i32 0
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back in an outer loop.
; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GFX940: v_mfma_f32_32x32x1_2b_f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[OUTER_LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Two nested loops, each exiting when its counter reaches 16. The outer phi
; starts from zeroinitializer, the inner phi starts from the outer phi, and the
; inner MFMA result feeds both phis on their back edges.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
  br label %inner.for.cond.preheader

inner.for.cond.preheader:
  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

inner.exit:
  %inc.0 = add nuw nsw i32 %c.0, 1
  %cc.0 = icmp eq i32 %inc.0, 16
  br i1 %cc.0, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; Intrinsics called by the kernels above.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

; Attribute group #0, referenced by the kernels above.
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }