; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A %s

; Codegen test for loops whose <32 x float> accumulator is carried through
; @llvm.amdgcn.mfma.f32.32x32x1f32. Each kernel seeds the accumulator phi in a
; different way. The check lines verify how the accumulator registers are
; initialised ahead of the loop, that no v_accvgpr_* instruction is matched
; between the loop label and its back-edge branch, and how the final value is
; stored. Pattern variables such as LOOP, TMP and LEAD are reused in every
; function because -enable-var-scope clears them at each label directive.

; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back inside the loop.

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator is seeded with zeroinitializer and then fed back from the
; MFMA result on every trip around the single-block loop.
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  ; Only the final accumulator value is written to memory.
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:

; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.
; Check that we do not use 32 temp sgprs as well.
47 48; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 49; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 50; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]] 51; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]] 52 53; GCN: [[LOOP:.LBB[0-9_]+]]: 54; GCN-NOT: v_accvgpr 55; GFX908_A: v_mfma_f32_32x32x1f32 56; GCN-NOT: v_accvgpr 57; GCN: s_cbranch_scc1 [[LOOP]] 58 59; GFX908-COUNT-32: v_accvgpr_read_b32 60; GFX90A-NOT: v_accvgpr_read_b32 61; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 62; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 63 64define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) { 65entry: 66 br label %for.cond.preheader 67 68for.cond.preheader: 69 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ] 70 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 71 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 72 %inc = add nuw nsw i32 %c, 1 73 %cc = icmp eq i32 %inc, 16 74 br i1 %cc, label %exit, label %for.cond.preheader 75 76exit: 77 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 78 ret void 79} 80 81; GCN-LABEL: {{^}}test_mfma_loop_non_splat: 82 83; GCN: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0{{$}} 84; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} 85; GFX908-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 86; GFX90A-COUNT-30: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]{{$}} 87 88; GCN: [[LOOP:.LBB[0-9_]+]]: 89; GCN-NOT: v_accvgpr 90; 
GFX908_A: v_mfma_f32_32x32x1f32 91; GCN-NOT: v_accvgpr 92; GCN: s_cbranch_scc1 [[LOOP]] 93 94; GFX908-COUNT-32: v_accvgpr_read_b32 95; GFX90A-NOT: v_accvgpr_read_b32 96; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}] 97; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}] 98 99define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) { 100entry: 101 br label %for.cond.preheader 102 103for.cond.preheader: 104 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ] 105 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 106 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 107 %inc = add nuw nsw i32 %c, 1 108 %cc = icmp eq i32 %inc, 16 109 br i1 %cc, label %exit, label %for.cond.preheader 110 111exit: 112 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 113 ret void 114} 115 116; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: 117 118; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 119; 3 vgprs are needed to avoid wait states between writes. 

; gfx908 sequence: the first three v_mov_b32 lines capture TMP1, TMP2 and TMP3;
; every later v_mov_b32 has to reuse one of those three registers, and the
; sequence contains 32 accumulator writes in total.
; GFX908: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
; GFX908: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
; GFX908: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908: v_mov_b32_e32 [[TMP1]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP2]], 0x4{{[0-9a-f]+}}
; GFX908: v_mov_b32_e32 [[TMP3]], 0x4{{[0-9a-f]+}}
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]

; gfx90a sequence: 32 move/write pairs. Each pair re-captures TMP, so any VGPR
; is accepted for each constant.
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x4{{[0-9a-f]+}}
; GFX90A: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]


; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator seed is the sequence 123.0 through 154.0, so each of the 32
; elements needs its own constant.
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:

; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator seed is the workitem id (bitcast to float) inserted into all
; 32 elements; the checks above expect every accumulator write to read v0.
define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  ; Build the splat one element at a time, indices 0 through 31.
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:

; GFX908_A: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator seed is the kernel argument %init inserted into all 32
; elements; the checks above expect it to be copied from an SGPR into a VGPR.
define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
entry:
  ; Build the splat one element at a time, indices 0 through 31.
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:

; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
; GFX908_A-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}

; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-28: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator seed mixes three sources: element 0 is the workitem id
; (bitcast to float), element 1 is the kernel argument %x, and the other 30
; elements are zero.
define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %init = bitcast i32 %tid to float
  ; Start from an all-zero vector and overwrite only elements 0 and 1.
  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr
; GCN-NOT: v_accvgpr

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator seed is the result of an MFMA executed in the entry block and
; is forwarded into the loop phi without any intermediate vector rebuild.
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
entry:
  ; Pre-loop MFMA with a zero accumulator input; its result seeds the loop.
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
; GFX90A-NOT: v_accvgpr
; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
; GFX90A-NOT: v_accvgpr

; Check that we are using only one tmp VGPR.

; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
; GFX908-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], [[TMP]]{{$}}
; GFX90A-COUNT-29: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; GCN: [[LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; The accumulator seed is element 0 of a pre-loop MFMA result, inserted into
; all 32 elements, so the splat source itself comes from an MFMA result.
define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
entry:
  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
  ; Take one lane of the MFMA result and splat it, indices 0 through 31.
  %init = extractelement <32 x float> %mai.0, i32 0
  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31

  br label %for.cond.preheader

for.cond.preheader:
  ; %phi is the loop-carried accumulator; %c is the trip counter.
  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  ; Leave the loop once the incremented counter reaches 16.
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %exit, label %for.cond.preheader

exit:
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:

; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]

; Check that we do not copy agprs to vgprs and back in an outer loop.

; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GFX908_A: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[INNER_LOOP]]
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[OUTER_LOOP]]

; Final result should be read only once after the loop.

; GFX908-COUNT-32: v_accvgpr_read_b32
; GFX90A-NOT: v_accvgpr_read_b32
; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]

; Two-level loop nest. The accumulator starts at zero, is carried by %phi.0
; around the outer loop and by %phi around the inner loop; %mai.1 feeds both
; phis. Each level leaves once its own counter reaches 16.
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
  br label %for.cond.preheader

for.cond.preheader:
  ; Outer-loop header: %phi.0 receives the inner loop's last MFMA result.
  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
  br label %inner.for.cond.preheader

inner.for.cond.preheader:
  ; Inner-loop body: the counter %c restarts at 0 on every outer iteration.
  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
  %inc = add nuw nsw i32 %c, 1
  %cc = icmp eq i32 %inc, 16
  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader

inner.exit:
  ; Outer-loop latch: advance the outer counter and test it against 16.
  %inc.0 = add nuw nsw i32 %c.0, 1
  %cc.0 = icmp eq i32 %inc.0, 16
  br i1 %cc.0, label %exit, label %for.cond.preheader

exit:
  ; Only the final accumulator value is written to memory.
  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
  ret void
}

; Intrinsics used by the kernels in this file.
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()