1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s 2 3; GCN-LABEL: {{^}}test_mfma_loop_zeroinit: 4 5; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 6 7; Check that we do not copy agprs to vgprs and back inside the loop. 8 9; GCN: [[LOOP:BB[0-9_]+]]: 10; GCN-NOT: v_accvgpr 11; GCN: v_mfma_f32_32x32x1f32 12; GCN-NOT: v_accvgpr 13; GCN: s_cbranch_scc1 [[LOOP]] 14 15; Final result should be read only once after the loop. 16 17; GCN-COUNT32: v_accvgpr_read_b32 18 19define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) { 20entry: 21 br label %for.cond.preheader 22 23for.cond.preheader: 24 %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ] 25 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 26 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 27 %inc = add nuw nsw i32 %c, 1 28 %cc = icmp eq i32 %inc, 16 29 br i1 %cc, label %exit, label %for.cond.preheader 30 31exit: 32 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 33 ret void 34} 35 36; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat: 37 38; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 39; 3 vgprs are needed to avoid wait states between writes. 40; Check that we do not use 32 temp sgprs as well. 41 42; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000 43; GCN-COUNT32: v_accvgpr_write_b32 a0, [[TMP]] 44 45; GCN: [[LOOP:BB[0-9_]+]]: 46; GCN-NOT: v_accvgpr 47; GCN: v_mfma_f32_32x32x1f32 48; GCN-NOT: v_accvgpr 49; GCN: s_cbranch_scc1 [[LOOP]] 50 51; GCN-COUNT32: v_accvgpr_read_b32 52 53define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) { 54entry: 55 br label %for.cond.preheader 56 57for.cond.preheader: 58 %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ] 59 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 60 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 61 %inc = add nuw nsw i32 %c, 1 62 %cc = icmp eq i32 %inc, 16 63 br i1 %cc, label %exit, label %for.cond.preheader 64 65exit: 66 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 67 ret void 68} 69 70; GCN-LABEL: {{^}}test_mfma_loop_non_splat: 71 72; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 73; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}} 74; GCN-COUNT30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 75 76; GCN: [[LOOP:BB[0-9_]+]]: 77; GCN-NOT: v_accvgpr 78; GCN: v_mfma_f32_32x32x1f32 79; GCN-NOT: v_accvgpr 80; GCN: s_cbranch_scc1 [[LOOP]] 81 82; GCN-COUNT32: v_accvgpr_read_b32 83 84define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) { 85entry: 86 br label %for.cond.preheader 87 88for.cond.preheader: 89 %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ] 90 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 91 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 92 %inc = add nuw nsw i32 %c, 1 93 %cc = icmp eq i32 %inc, 16 94 br i1 %cc, label %exit, label %for.cond.preheader 95 96exit: 97 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 98 ret void 99} 100 101; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq: 102 103; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only. 104; 3 vgprs are needed to avoid wait states between writes. 105 106; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000 107; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000 108; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000 109; GCN-COUNT29: v_mov_b32_e32 v1, 0x4{{[0-9a-f]+}} 110; GCN-COUNT10: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]] 111; GCN-COUNT11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]] 112; GCN-COUNT11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]] 113 114; GCN: [[LOOP:BB[0-9_]+]]: 115; GCN-NOT: v_accvgpr 116; GCN: v_mfma_f32_32x32x1f32 117; GCN-NOT: v_accvgpr 118; GCN: s_cbranch_scc1 [[LOOP]] 119 120; GCN-COUNT32: v_accvgpr_read_b32 121 122define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) { 123entry: 124 br label %for.cond.preheader 125 126for.cond.preheader: 127 %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ] 128 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 129 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 130 %inc = add nuw nsw i32 %c, 1 131 %cc = icmp eq i32 %inc, 16 132 br i1 %cc, label %exit, label %for.cond.preheader 133 134exit: 135 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 136 ret void 137} 138 139; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init: 140 141; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}} 142 143; GCN: [[LOOP:BB[0-9_]+]]: 144; GCN-NOT: v_accvgpr 145; GCN: v_mfma_f32_32x32x1f32 146; GCN-NOT: v_accvgpr 147; GCN: s_cbranch_scc1 [[LOOP]] 148 149; GCN-COUNT32: v_accvgpr_read_b32 150 151define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) { 152entry: 153 %tid = call i32 @llvm.amdgcn.workitem.id.x() 154 %init = bitcast i32 %tid to float 155 %tmp0 = insertelement <32 x float> undef, float %init, i32 0 156 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 157 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 158 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 159 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 160 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 161 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 162 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 163 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 164 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 165 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 166 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 167 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 168 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 169 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 170 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 171 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 172 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 173 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 174 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 175 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 176 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 177 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 178 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 179 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 180 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 181 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 182 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 183 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 184 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 185 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 186 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 187 188 br label %for.cond.preheader 189 190for.cond.preheader: 191 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] 192 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 193 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 194 %inc = add nuw nsw i32 %c, 1 195 %cc = icmp eq i32 %inc, 16 196 br i1 %cc, label %exit, label %for.cond.preheader 197 198exit: 199 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 200 ret void 201} 202 203; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init: 204 205; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} 206; GCN-COUNT32: v_accvgpr_write_b32 a0, [[TMP]] 207 208; GCN: [[LOOP:BB[0-9_]+]]: 209; GCN-NOT: v_accvgpr 210; GCN: v_mfma_f32_32x32x1f32 211; GCN-NOT: v_accvgpr 212; GCN: s_cbranch_scc1 [[LOOP]] 213 214; GCN-COUNT32: v_accvgpr_read_b32 215 216define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) { 217entry: 218 %tmp0 = insertelement <32 x float> undef, float %init, i32 0 219 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 220 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 221 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 222 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 223 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 224 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 225 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 226 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 227 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 228 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 229 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 230 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 231 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 232 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 233 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 234 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 235 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 236 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 237 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 238 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 239 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 240 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 241 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 242 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 243 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 244 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 245 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 246 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 247 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 248 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 249 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 250 251 br label %for.cond.preheader 252 253for.cond.preheader: 254 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] 255 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 256 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 257 %inc = add nuw nsw i32 %c, 1 258 %cc = icmp eq i32 %inc, 16 259 br i1 %cc, label %exit, label %for.cond.preheader 260 261exit: 262 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 263 ret void 264} 265 266; GCN-LABEL: {{^}}test_mfma_loop_mixed_init: 267 268; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v0 269; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}} 270; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]] 271 272; GCN-COUNT30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}} 273 274; GCN: [[LOOP:BB[0-9_]+]]: 275; GCN-NOT: v_accvgpr 276; GCN: v_mfma_f32_32x32x1f32 277; GCN-NOT: v_accvgpr 278; GCN: s_cbranch_scc1 [[LOOP]] 279 280; GCN-COUNT32: v_accvgpr_read_b32 281 282define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) { 283entry: 284 %tid = call i32 @llvm.amdgcn.workitem.id.x() 285 %init = bitcast i32 %tid to float 286 %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0 287 %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1 288 289 br label %for.cond.preheader 290 291for.cond.preheader: 292 %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ] 293 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 294 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 295 %inc = add nuw nsw i32 %c, 1 296 %cc = icmp eq i32 %inc, 16 297 br i1 %cc, label %exit, label %for.cond.preheader 298 299exit: 300 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 301 ret void 302} 303 304; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init: 305 306; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 307; GCN: v_mfma_f32_32x32x1f32 308; GCN-NOT: v_accvgpr 309 310; GCN: [[LOOP:BB[0-9_]+]]: 311; GCN-NOT: v_accvgpr 312; GCN: v_mfma_f32_32x32x1f32 313; GCN-NOT: v_accvgpr 314; GCN: s_cbranch_scc1 [[LOOP]] 315 316; GCN-COUNT32: v_accvgpr_read_b32 317define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) { 318entry: 319 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) 320 321 br label %for.cond.preheader 322 323for.cond.preheader: 324 %phi = phi <32 x float> [ %mai.0, %entry ], [ %mai.1, %for.cond.preheader ] 325 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 326 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 327 %inc = add nuw nsw i32 %c, 1 328 %cc = icmp eq i32 %inc, 16 329 br i1 %cc, label %exit, label %for.cond.preheader 330 331exit: 332 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 333 ret void 334} 335 336; GCN-LABEL: {{^}}test_mfma_loop_agpr_init: 337 338; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, 0 339; GCN: v_mfma_f32_32x32x1f32 340 341; Check that we are using only one tmp VGPR. 342 343; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}} 344; GCN-COUNT32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}} 345 346; GCN: [[LOOP:BB[0-9_]+]]: 347; GCN-NOT: v_accvgpr 348; GCN: v_mfma_f32_32x32x1f32 349; GCN-NOT: v_accvgpr 350; GCN: s_cbranch_scc1 [[LOOP]] 351 352; GCN-COUNT32: v_accvgpr_read_b32 353define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) { 354entry: 355 %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0) 356 %init = extractelement <32 x float> %mai.0, i32 0 357 %tmp0 = insertelement <32 x float> undef, float %init, i32 0 358 %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1 359 %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2 360 %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3 361 %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4 362 %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5 363 %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6 364 %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7 365 %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8 366 %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9 367 %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10 368 %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11 369 %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12 370 %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13 371 %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14 372 %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15 373 %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16 374 %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17 375 %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18 376 %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19 377 %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20 378 %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21 379 %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22 380 %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23 381 %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24 382 %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25 383 %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26 384 %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27 385 %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28 386 %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29 387 %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30 388 %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31 389 390 br label %for.cond.preheader 391 392for.cond.preheader: 393 %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ] 394 %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ] 395 %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0) 396 %inc = add nuw nsw i32 %c, 1 397 %cc = icmp eq i32 %inc, 16 398 br i1 %cc, label %exit, label %for.cond.preheader 399 400exit: 401 store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg 402 ret void 403} 404 405declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) 406declare i32 @llvm.amdgcn.workitem.id.x() 407