1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s 2; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s 3; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940 %s 4 5; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: 6 7; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 8; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 9; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 10 11; Check that we do not copy agprs to vgprs and back inside the loop. 12 13; GCN: [[LOOP:.LBB[0-9_]+]]: 14; GCN-NOT: v_accvgpr 15; GFX908_A: v_mfma_f32_32x32x1f32 16; GFX940: v_mfma_f32_32x32x1_2b_f32 17; GCN-NOT: v_accvgpr 18; GCN: s_cbranch_scc1 [[LOOP]] 19 20; Final result should be read only once after the loop. 21 22; GFX908-COUNT-32: v_accvgpr_read_b32 23; GFX90A-NOT: v_accvgpr_read_b32 24; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 25; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 26 27define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 { 28entry: 29 br label %for.cond.preheader 30 31for.cond.preheader: 32 %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] 33 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 34 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 35 %inc = add nuw nsw i32 %c, 1 36 %cc = icmp eq i32 %inc, 16 37 br i1 %cc, label %exit, label %for.cond.preheader 38 39exit: 40 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 41 ret void 42} 43 44; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat: 45 46; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 47; 3 vgprs are needed to avoid wait states between writes. 48; Check that we do not use 32 temp sgprs as well. 49 50; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 51; GFX940: s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000 52; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 53; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] 54; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 55 56; GCN: [[LOOP:.LBB[0-9_]+]]: 57; GCN-NOT: v_accvgpr 58; GFX908_A: v_mfma_f32_32x32x1f32 59; GFX940: v_mfma_f32_32x32x1_2b_f32 60; GCN-NOT: v_accvgpr 61; GCN: s_cbranch_scc1 [[LOOP]] 62 63; GFX908-COUNT-32: v_accvgpr_read_b32 64; GFX90A-NOT: v_accvgpr_read_b32 65; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 66; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 67 68define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) #0 { 69entry: 70 br label %for.cond.preheader 71 72for.cond.preheader: 73 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ] 74 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 75 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 76 %inc = add nuw nsw i32 %c, 1 77 %cc = icmp eq i32 %inc, 16 78 br i1 %cc, label %exit, label %for.cond.preheader 79 80exit: 81 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 82 ret void 83} 84 85; GCN-LABEL: {{^}}test_mfma_loop_non_splat: 86 87; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}} 88; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} 89; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 90; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}} 91 92; GCN: [[LOOP:.LBB[0-9_]+]]: 93; GCN-NOT: v_accvgpr 94; GFX908_A: v_mfma_f32_32x32x1f32 95; GFX940: v_mfma_f32_32x32x1_2b_f32 96; GCN-NOT: v_accvgpr 97; GCN: s_cbranch_scc1 [[LOOP]] 98 99; GFX908-COUNT-32: v_accvgpr_read_b32 100; GFX90A-NOT: v_accvgpr_read_b32 101; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 102; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 103 104define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) #0 { 105entry: 106 br label %for.cond.preheader 107 108for.cond.preheader: 109 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ] 110 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 111 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 112 %inc = add nuw nsw i32 %c, 1 113 %cc = icmp eq i32 %inc, 16 114 br i1 %cc, label %exit, label %for.cond.preheader 115 116exit: 117 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 118 ret void 119} 120 121; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: 122 123; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 124; 3 vgprs are needed to avoid wait states between writes. 125 126; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 127; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 128; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 129; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 130; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 131; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 132; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 133; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 134; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 135; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 136; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 137; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 138; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 139; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 140; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 141; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 142; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 143; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 144; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 145; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 146; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 147; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 148; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 149; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 150; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 151; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 152; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 153; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 154; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 155; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 156; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 157; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 158; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 159; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 160; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 161; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 162; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 163; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 164; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 165; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 166; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 167; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 168; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 169; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 170; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 171; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 172; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 173; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 174; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 175; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 176; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 177; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 178; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 179; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 180; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 181; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 182; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 183; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 184; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}} 185; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}} 186; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}} 187; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 188; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 189; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 190 191; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 192; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 193; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 194; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 195; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 196; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 197; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 198; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 199; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 200; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 201; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 202; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 203; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 204; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 205; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 206; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 207; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 208; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 209; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 210; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 211; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 212; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 213; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 214; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 215; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 216; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 217; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 218; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 219; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 220; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 221; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 222; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 223; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 224; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 225; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 226; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 227; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 228; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 229; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 230; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 231; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 232; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 233; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 234; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 235; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 236; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 237; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 238; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 239; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 240; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 241; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 242; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 243; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 244; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 245; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 246; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 247; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 248; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 249; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 250; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 251; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 252; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 253; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}} 254; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 255 256; GFX940-COUNT-32: s_mov_b32 s{{[0-9]+}}, 0x4{{[0-9a-f]+}} 257; GFX940-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}} 258 259; GCN: [[LOOP:.LBB[0-9_]+]]: 260; GCN-NOT: v_accvgpr 261; GFX908_A: v_mfma_f32_32x32x1f32 262; GFX940: v_mfma_f32_32x32x1_2b_f32 263; GCN-NOT: v_accvgpr 264; GCN: s_cbranch_scc1 [[LOOP]] 265 266; GFX908-COUNT-32: v_accvgpr_read_b32 267; GFX90A-NOT: v_accvgpr_read_b32 268; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 269; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 270 271define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) #0 { 272entry: 273 br label %for.cond.preheader 274 275for.cond.preheader: 276 %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ] 277 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 278 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 279 %inc = add nuw nsw i32 %c, 1 280 %cc = icmp eq i32 %inc, 16 281 br i1 %cc, label %exit, label %for.cond.preheader 282 283exit: 284 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 285 ret void 286} 287 288; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init: 289 290; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}} 291 292; GCN: [[LOOP:.LBB[0-9_]+]]: 293; GCN-NOT: v_accvgpr 294; GFX908_A: v_mfma_f32_32x32x1f32 295; GFX940: v_mfma_f32_32x32x1_2b_f32 296; GCN-NOT: v_accvgpr 297; GCN: s_cbranch_scc1 [[LOOP]] 298 299; GFX908-COUNT-32: v_accvgpr_read_b32 300; GFX90A-NOT: v_accvgpr_read_b32 301; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 302; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 303 304define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) #0 { 305entry: 306 %tid = call i32 @llvm.amdgcn.workitem.id.x() 307 %init = bitcast i32 %tid to float 308 %tmp0 = insertelement <32 x float> undef, float %init, i32 0 309 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 310 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 311 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 312 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 313 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 314 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 315 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 316 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 317 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 318 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 319 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 320 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 321 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 322 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 323 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 324 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 325 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 326 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 327 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 328 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 329 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 330 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 331 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 332 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 333 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 334 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 335 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 336 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 337 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 338 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 339 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 340 341 br label %for.cond.preheader 342 343for.cond.preheader: 344 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] 345 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 346 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 347 %inc = add nuw nsw i32 %c, 1 348 %cc = icmp eq i32 %inc, 16 349 br i1 %cc, label %exit, label %for.cond.preheader 350 351exit: 352 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 353 ret void 354} 355 356; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init: 357 358; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} 359; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 360; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] 361; GFX940: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}} 362; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 363 364; GCN: [[LOOP:.LBB[0-9_]+]]: 365; GCN-NOT: v_accvgpr 366; GFX908_A: v_mfma_f32_32x32x1f32 367; GFX940: v_mfma_f32_32x32x1_2b_f32 368; GCN-NOT: v_accvgpr 369; GCN: s_cbranch_scc1 [[LOOP]] 370 371; GFX908-COUNT-32: v_accvgpr_read_b32 372; GFX90A-NOT: v_accvgpr_read_b32 373; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 374; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 375 376define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) #0 { 377entry: 378 %tmp0 = insertelement <32 x float> undef, float %init, i32 0 379 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 380 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 381 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 382 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 383 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 384 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 385 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 386 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 387 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 388 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 389 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 390 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 391 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 392 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 393 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 394 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 395 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 396 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 397 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 398 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 399 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 400 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 401 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 402 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 403 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 404 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 405 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 406 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 407 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 408 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 409 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 410 411 br label %for.cond.preheader 412 413for.cond.preheader: 414 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] 415 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 416 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 417 %inc = add nuw nsw i32 %c, 1 418 %cc = icmp eq i32 %inc, 16 419 br i1 %cc, label %exit, label %for.cond.preheader 420 421exit: 422 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 423 ret void 424} 425 426; GCN-LABEL: {{^}}test_mfma_loop_mixed_init: 427 428; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0 429; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} 430; GFX940-DAG: s_load_dword [[TMP:s[0-9]+]], 431; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 432; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 433; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 434; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 435; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 436; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 437; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 438; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 439; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 440; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 441; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 442; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 443; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 444; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 445; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 446; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 447; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 448; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 449; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 450; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 451; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 452; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 453; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 454; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 455; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 456; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 457; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 458; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 459; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 460; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 461; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 462 463; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 464; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 465 466; GCN: [[LOOP:.LBB[0-9_]+]]: 467; GCN-NOT: v_accvgpr 468; GFX908_A: v_mfma_f32_32x32x1f32 469; GFX940: v_mfma_f32_32x32x1_2b_f32 470; GCN-NOT: v_accvgpr 471; GCN: s_cbranch_scc1 [[LOOP]] 472 473; GFX908-COUNT-32: v_accvgpr_read_b32 474; GFX90A-NOT: v_accvgpr_read_b32 475; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 476; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 477 478define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) #0 { 479entry: 480 %tid = call i32 @llvm.amdgcn.workitem.id.x() 481 %init = bitcast i32 %tid to float 482 %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0 483 %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1 484 485 br label %for.cond.preheader 486 487for.cond.preheader: 488 %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ] 489 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 490 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 491 %inc = add nuw nsw i32 %c, 1 492 %cc = icmp eq i32 %inc, 16 493 br i1 %cc, label %exit, label %for.cond.preheader 494 495exit: 496 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 497 ret void 498} 499 500; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init: 501 502; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 503; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] 504; GFX90A-NOT: v_accvgpr 505; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} 506; GFX90A-NOT: v_accvgpr 507; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} 508; GCN-NOT: v_accvgpr 509 510; GCN: [[LOOP:.LBB[0-9_]+]]: 511; GCN-NOT: v_accvgpr 512; GFX908_A: v_mfma_f32_32x32x1f32 513; GFX940: v_mfma_f32_32x32x1_2b_f32 514; GCN-NOT: v_accvgpr 515; GCN: s_cbranch_scc1 [[LOOP]] 516 517; GFX908-COUNT-32: v_accvgpr_read_b32 518; GFX90A-NOT: v_accvgpr_read_b32 519; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 520; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 521 522define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) #0 { 523entry: 524 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) 525 526 br label %for.cond.preheader 527 528for.cond.preheader: 529 %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ] 530 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 531 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 532 %inc = add nuw nsw i32 %c, 1 533 %cc = icmp eq i32 %inc, 16 534 br i1 %cc, label %exit, label %for.cond.preheader 535 536exit: 537 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 538 ret void 539} 540 541; GCN-LABEL: {{^}}test_mfma_loop_agpr_init: 542 543; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 544; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}] 545; GFX90A-NOT: v_accvgpr 546; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} 547; GFX90A-NOT: v_accvgpr 548; GFX940: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}} 549 550; Check that we are using only one tmp VGPR. 551 552; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} 553; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}} 554; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}} 555; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 556 557; GCN: [[LOOP:.LBB[0-9_]+]]: 558; GCN-NOT: v_accvgpr 559; GFX908_A: v_mfma_f32_32x32x1f32 560; GFX940: v_mfma_f32_32x32x1_2b_f32 561; GCN-NOT: v_accvgpr 562; GCN: s_cbranch_scc1 [[LOOP]] 563 564; GFX908-COUNT-32: v_accvgpr_read_b32 565; GFX90A-NOT: v_accvgpr_read_b32 566; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 567; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 568 569define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) #0 { 570entry: 571 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) 572 %init = extractelement <32 x float> %mai.0, i32 0 573 %tmp0 = insertelement <32 x float> undef, float %init, i32 0 574 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 575 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 576 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 577 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 578 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 579 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 580 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 581 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 582 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 583 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 584 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 585 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 586 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 587 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 588 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 589 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 590 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 591 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 592 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 593 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 594 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 595 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 596 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 597 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 598 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 599 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 600 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 601 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 602 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 603 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 604 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 605 606 br label %for.cond.preheader 607 608for.cond.preheader: 609 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] 610 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 611 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 612 %inc = add nuw nsw i32 %c, 1 613 %cc = icmp eq i32 %inc, 16 614 br i1 %cc, label %exit, label %for.cond.preheader 615 616exit: 617 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 618 ret void 619} 620 621; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit: 622 623; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 624; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0 625; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 626 627; Check that we do not copy agprs to vgprs and back in an outer loop. 628 629; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]: 630; GCN-NOT: v_accvgpr 631; GCN: [[INNER_LOOP:.LBB[0-9_]+]]: 632; GCN-NOT: v_accvgpr 633; GFX908_A: v_mfma_f32_32x32x1f32 634; GFX940: v_mfma_f32_32x32x1_2b_f32 635; GCN-NOT: v_accvgpr 636; GCN: s_cbranch_scc1 [[INNER_LOOP]] 637; GCN-NOT: v_accvgpr 638; GCN: s_cbranch_scc1 [[OUTER_LOOP]] 639 640; Final result should be read only once after the loop. 641 642; GFX908-COUNT-32: v_accvgpr_read_b32 643; GFX90A-NOT: v_accvgpr_read_b32 644; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 645; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 646 647define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) #0 { 648entry: 649 br label %for.cond.preheader 650 651for.cond.preheader: 652 %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ] 653 %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ] 654 br label %inner.for.cond.preheader 655 656inner.for.cond.preheader: 657 %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ] 658 %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ] 659 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 660 %inc = add nuw nsw i32 %c, 1 661 %cc = icmp eq i32 %inc, 16 662 br i1 %cc, label %inner.exit, label %inner.for.cond.preheader 663 664inner.exit: 665 %inc.0 = add nuw nsw i32 %c.0, 1 666 %cc.0 = icmp eq i32 %inc.0, 16 667 br i1 %cc.0, label %exit, label %for.cond.preheader 668 669exit: 670 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 671 ret void 672} 673 674declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) 675declare i32 @llvm.amdgcn.workitem.id.x() 676 677attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } 678