; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s

; Each kernel below carries an "amdgpu-num-vgpr" attribute (see the attribute
; groups at the end of the file) and uses AGPRs through MFMA intrinsics and/or
; inline asm with "a" constraints. The check lines pin whether AGPR values are
; moved via v_accvgpr_read/write or through scratch memory, and the resulting
; scratch size.

; Limit of 24 VGPRs (attributes #0), two chained 16x16x1f32 MFMAs.
; Expected: no scratch descriptor setup, no buffer_store/buffer_load, and a
; scratch size of 0. v_accvgpr_write_b32 is expected on gfx90a and rejected
; on gfx908.
; GCN-LABEL: {{^}}max_24regs_32a_used:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG: v_mfma_f32_16x16x1f32
; GCN-DAG: v_mfma_f32_16x16x1f32
; GCN-DAG: v_accvgpr_read_b32
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GFX908-NOT: v_accvgpr_write_b32
; GFX90A: v_accvgpr_write_b32
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
bb:
  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
  %elt1 = extractelement <16 x float> %mai.2, i32 0
  %elt2 = extractelement <16 x float> %mai.1, i32 15
  %elt3 = extractelement <16 x float> %mai.1, i32 14
  %elt4 = extractelement <16 x float> %mai.2, i32 1
  store float %elt1, float addrspace(1)* %out
  %gep1 = getelementptr float, float addrspace(1)* %out, i64 1
  store float %elt2, float addrspace(1)* %gep1
  %gep2 = getelementptr float, float addrspace(1)* %out, i64 2
  store float %elt3, float addrspace(1)* %gep2
  %gep3 = getelementptr float, float addrspace(1)* %out, i64 3
  store float %elt4, float addrspace(1)* %gep3

  ret void
}

; Limit of 12 VGPRs (attributes #2). An AGPR value is expected to be copied
; into a VGPR with v_accvgpr_read_b32 and later restored from that same VGPR
; with v_accvgpr_write_b32 (the two are tied by the VSPILL capture), with no
; buffer_store/buffer_load in between and a scratch size of 0.
; GCN-LABEL: {{^}}max_12regs_13a_used:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_12regs_13a_used(i32 %cond, <4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
bb:
  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
  %cmp = icmp eq i32 %cond, 0
  br i1 %cmp, label %use, label %st

use:
  ; Five scalar AGPR operands are consumed here while %mai.1 and %mai.2 are
  ; still needed in %st.
  call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
  store volatile <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
  br label %st

st:
  ; NOTE(review): %gep1 and %gep2 have no users in this function; they are
  ; left in place so the test input stays unchanged.
  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
  %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
  call void asm sideeffect "", "a,a"(<4 x float> %mai.1, <4 x float> %mai.2)
  ret void
}

; Limit of 10 VGPRs (attributes #1). Every value is defined and consumed by
; inline asm with AGPR ("a") constraints. Expected: a v_accvgpr_read_b32 /
; v_accvgpr_write_b32 pair through one VGPR (VSPILL capture), no
; buffer_store/buffer_load in between, and a scratch size of 0.
; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a{{[0-9]+}}
; GCN-NOT: buffer_store_dword
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_10_vgprs_used_9a() #1 {
  %a1 = call <4 x i32> asm sideeffect "", "=a"()
  %a2 = call <4 x i32> asm sideeffect "", "=a"()
  %a3 = call i32 asm sideeffect "", "=a"()
  %a4 = call <2 x i32> asm sideeffect "", "=a"()
  call void asm sideeffect "", "a,a,a"(<4 x i32> %a1, <4 x i32> %a2, i32 %a3)
  call void asm sideeffect "", "a"(<2 x i32> %a4)
  ret void
}

; Limit of 32 VGPRs (attributes #3). An AGPR value defined by inline asm is
; used after a 32x32x1f32 MFMA. Expected: no buffer_store_dword before the
; v_accvgpr_read_b32, no buffer_load_dword between the MFMA and the
; v_accvgpr_write_b32, and a scratch size of 0.
; GCN-LABEL: {{^}}max_32regs_mfma32:
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-NOT: buffer_store_dword
; GCN: v_accvgpr_read_b32
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: buffer_load_dword
; GCN: v_accvgpr_write_b32
; GCN: ScratchSize: 0
define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 {
bb:
  %v = call i32 asm sideeffect "", "=a"()
  br label %use

use:
  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 2.0>, i32 0, i32 0, i32 0)
  call void asm sideeffect "", "a"(i32 %v)
  %elt1 = extractelement <32 x float> %mai.1, i32 0
  store float %elt1, float addrspace(1)* %arg
  ret void
}

; Should spill agprs to memory for both gfx908 and gfx90a.
; Limit of 5 VGPRs (attributes #4). Scratch descriptor setup and a non-zero
; scratch size are expected. The gfx908 checks route each of a0..a3 through v1
; before a buffer_store_dword; the gfx90a checks store a0/a1 directly and hold
; a2/a3 in v4/v3. The scratch descriptor operand is matched with a
; register-range regex on both the stores and the reloads.
; GCN-LABEL: {{^}}max_5regs_used_8a:
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1

; GFX908-DAG: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v1, a1 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v1, a2 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Spill
; GFX908-DAG: v_accvgpr_read_b32 v1, a3 ; Reload Reuse
; GFX908-DAG: buffer_store_dword v1, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Spill

; GFX90A-DAG: buffer_store_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Spill
; GFX90A-DAG: buffer_store_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Spill
; GFX90A-DAG: v_accvgpr_read_b32 v4, a2 ; Reload Reuse
; GFX90A-DAG: v_accvgpr_read_b32 v3, a3 ; Reload Reuse

; GCN: v_mfma_f32_4x4x1f32 a[0:3], v{{[0-9]+}}, v{{[0-9]+}}, a[0:3]

; GFX908-DAG: buffer_load_dword v0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v2, off, s[{{[0-9:]+}}], 0 offset:12 ; 4-byte Folded Reload
; GFX908-DAG: buffer_load_dword v3, off, s[{{[0-9:]+}}], 0 offset:16 ; 4-byte Folded Reload
; GFX908: global_store_dwordx4 v[{{[0-9:]+}}], v[0:3], off

; GFX90A-DAG: buffer_load_dword a0, off, s[{{[0-9:]+}}], 0 offset:4 ; 4-byte Folded Reload
; GFX90A-DAG: buffer_load_dword a1, off, s[{{[0-9:]+}}], 0 offset:8 ; 4-byte Folded Reload
; GFX90A-DAG: v_accvgpr_write_b32 a2, v4 ; Reload Reuse
; GFX90A-DAG: v_accvgpr_write_b32 a3, v3 ; Reload Reuse
; GFX90A: global_store_dwordx4 v[0:1], a[0:3], off

; GCN: ScratchSize: 20
define amdgpu_kernel void @max_5regs_used_8a(<4 x float> addrspace(1)* %arg) #4 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  ; %v0 (a VGPR value) and %a4 (a 4-wide AGPR value) are both defined before
  ; the MFMA and used after it.
  %v0 = call float asm sideeffect "; def $0", "=v"()
  %a4 = call <4 x float> asm sideeffect "; def $0", "=a"()
  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i32 %tid
  %mai.in = load <4 x float>, <4 x float> addrspace(1)* %gep
  %mai.out = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.in, i32 0, i32 0, i32 0)
  store <4 x float> %mai.out, <4 x float> addrspace(1)* %gep
  store volatile <4 x float> %a4, <4 x float> addrspace(1)* undef
  call void asm sideeffect "; use $0", "v"(float %v0)
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)

; Per-kernel VGPR limits referenced by the kernels above.
attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
attributes #1 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }
attributes #4 = { nounwind "amdgpu-num-vgpr"="5" }