; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s

; This file compiles the same IR for gfx908 and gfx900 and compares how each
; target copes with more live 32-bit vector values than the function's VGPR
; limit allows. The gfx908 checks look for v_accvgpr_write_b32 /
; v_accvgpr_read_b32 copies, while the gfx900 checks look for buffer stores
; and loads together with a non-zero ScratchSize.

; Kernel limited to 11 VGPRs by attribute #2. Ten volatile loads are all kept
; live up to an inline asm statement with ten "v" operands. Neither target is
; expected to reference the scratch resource or emit any buffer_ instruction;
; gfx908 is additionally expected to move a value through an accumulator
; register and back.
; GCN-LABEL: {{^}}max_11_vgprs:
; GFX900-NOT: SCRATCH_RSRC
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}}
; GFX900-NOT: buffer_
; GFX908-NOT: buffer_
; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]]
; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]]

; GFX900: NumVgprs: 11
; GFX908: NumVgprs: 10
; GFX900: ScratchSize: 0
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 2
; GFX900: NumVGPRsForWavesPerEU: 11
; GFX908: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_11_vgprs(i32 addrspace(1)* %p) #2 {
  ; The index comes from a volatile load rather than a workitem-id intrinsic.
  %tid = load volatile i32, i32 addrspace(1)* undef
  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
  %v1 = load volatile i32, i32 addrspace(1)* %p1
  %v2 = load volatile i32, i32 addrspace(1)* %p2
  %v3 = load volatile i32, i32 addrspace(1)* %p3
  %v4 = load volatile i32, i32 addrspace(1)* %p4
  %v5 = load volatile i32, i32 addrspace(1)* %p5
  %v6 = load volatile i32, i32 addrspace(1)* %p6
  %v7 = load volatile i32, i32 addrspace(1)* %p7
  %v8 = load volatile i32, i32 addrspace(1)* %p8
  %v9 = load volatile i32, i32 addrspace(1)* %p9
  %v10 = load volatile i32, i32 addrspace(1)* %p10
  ; All ten loaded values are operands of one asm statement, so they must be
  ; live at the same point.
  call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
  store volatile i32 %v1, i32 addrspace(1)* undef
  store volatile i32 %v2, i32 addrspace(1)* undef
  store volatile i32 %v3, i32 addrspace(1)* undef
  store volatile i32 %v4, i32 addrspace(1)* undef
  store volatile i32 %v5, i32 addrspace(1)* undef
  store volatile i32 %v6, i32 addrspace(1)* undef
  store volatile i32 %v7, i32 addrspace(1)* undef
  store volatile i32 %v8, i32 addrspace(1)* undef
  store volatile i32 %v9, i32 addrspace(1)* undef
  store volatile i32 %v10, i32 addrspace(1)* undef
  ret void
}

; Kernel limited to 10 VGPRs by attribute #0 that loads and stores a single
; <32 x float>. Both targets are expected to set up the scratch resource and
; emit a buffer store; gfx908 is also expected to write a0 and a9, and no
; output for this function may mention a10. The expected ScratchSize is
; smaller on gfx908 (68) than on gfx900 (100).
; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: buffer_store_dword v{{[0-9]}},
; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
; GCN-NOT: a10

; GFX908: NumVgprs: 10
; GFX900: ScratchSize: 100
; GFX908: ScratchSize: 68
; GFX908: VGPRBlocks: 2
; GFX908: NumVGPRsForWavesPerEU: 10
define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
  store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
  ret void
}

; Kernel with attribute #1 (flat work-group size 1..256) that performs nine
; volatile <32 x float> loads followed by nine volatile stores of the loaded
; values. gfx900 is expected to use scratch (buffer store and load, ScratchSize
; 148); gfx908 is expected to use accumulator registers with no scratch
; resource setup, no buffer_ instructions and ScratchSize 0.
; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32:
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908-DAG: v_accvgpr_write_b32 a0, v
; GFX900: buffer_store_dword v
; GFX900: buffer_load_dword v
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32

; GFX900: NumVgprs: 256
; GFX900: ScratchSize: 148
; GFX908: NumVgprs: 254
; GFX908: ScratchSize: 0
; GCN: VGPRBlocks: 63
; GFX900: NumVGPRsForWavesPerEU: 256
; GFX908: NumVGPRsForWavesPerEU: 254
define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
  ret void
}

; Same loads and stores as max_256_vgprs_spill_9x32, but the stores live in a
; second basic block (%st) reached by an unconditional branch, so the loaded
; values are live across a block boundary. On gfx908 the expected
; NumVGPRsForWavesPerEU equals the expected NumVgprs, as in every other case
; in this file.
; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GFX908-NOT: SCRATCH_RSRC
; GFX908: v_accvgpr_write_b32
; GFX908: global_load_
; GFX900: buffer_store_dword v
; GFX900: buffer_load_dword v
; GFX908-NOT: buffer_
; GFX908-DAG: v_accvgpr_read_b32

; GFX900: NumVgprs: 256
; GFX908: NumVgprs: 254
; GFX900: ScratchSize: 1796
; GFX908: ScratchSize: 0
; GFX900: VGPRBlocks: 63
; GFX908: VGPRBlocks: 63
; GFX900: NumVGPRsForWavesPerEU: 256
; GFX908: NumVGPRsForWavesPerEU: 254
define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
  br label %st

st:
  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
  ret void
}

; Make sure there's no crash when we have loads from fixed stack
; objects and are processing VGPR spills

; Non-kernel function taking two <32 x float> arguments plus a pointer. It
; performs seven volatile <32 x float> loads, then stores both arguments and
; all loaded values in a second block. Only gfx908 output is checked: an
; accumulator write followed by buffer loads addressed off s32 at offset 0
; and offset 4.
; GCN-LABEL: {{^}}stack_args_vgpr_spill:
; GFX908: v_accvgpr_write_b32
; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32
; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
define void @stack_args_vgpr_spill(<32 x float> %arg0, <32 x float> %arg1, <32 x float> addrspace(1)* %p) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
  br label %st

st:
  store volatile <32 x float> %arg0, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %arg1, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
  ret void
}


declare i32 @llvm.amdgcn.workitem.id.x()

; #0 and #2 cap the VGPR count at 10 and 11; #1 sets only the flat
; work-group size range.
attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
attributes #2 = { nounwind "amdgpu-num-vgpr"="11" }