; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s

; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.
;
; Every test below follows the same recipe:
;   1. an alloca sized so that the spill slot of %a lands at a chosen scratch
;      offset (4092 is the largest offset these tests expect to see in the
;      instruction offset field; 4096 is expected to go through an SGPR),
;   2. a volatile load producing %a,
;   3. an inline asm clobbering v0-v7 to force %a to spill,
;   4. a later use of %a (or of the alloca) to keep everything alive.
;
; The function labels are anchored to the symbol definition ("name:" at the
; start of a line) so that each one matches exactly one line of the output and
; cannot match a longer symbol that merely starts with the same name.

; CHECK-LABEL: {{^}}test_inst_offset_kernel:
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
  ; Place the spill slot of %a at scratch offset 4092 (see the expected store
  ; below), so that its offset just fits in the instruction offset field.
  ; NOTE(review): the alloca itself is only 4088 bytes; the remaining 4 bytes
  ; below the spill slot presumably come from the kernel's frame layout --
  ; confirm.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %a after the clobber so that it has to be reloaded.
  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; CHECK-LABEL: {{^}}test_sgpr_offset_kernel:
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
  ; Place the spill slot of %a at scratch offset 4096, so that its offset does
  ; not fit in the instruction, and has to live in the SGPR offset.
  ; NOTE(review): the alloca itself is only 4092 bytes; the remaining 4 bytes
  ; below the spill slot presumably come from the kernel's frame layout --
  ; confirm.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_mov_b32 s6, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %a after the clobber so that it has to be reloaded.
  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash. For that reason the
; scavenge-failure case below is only exercised for a non-kernel function.

; CHECK-LABEL: {{^}}test_sgpr_offset_function_scavenge_fail:
define void @test_sgpr_offset_function_scavenge_fail() #2 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Define eight SGPR values that stay live up to the first asm use below.
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; With no SGPR available, the expected code bumps s32 by the offset around
  ; the spill store and restores it afterwards.
  ; CHECK: s_add_u32 s32, s32, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
  ; CHECK: s_sub_u32 s32, s32, 0x40000
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  ; A second set of eight SGPR values, live across the VGPR clobber below.
  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; The reload is expected to use the same s32 bump/restore sequence.
  ; CHECK: s_add_u32 s32, s32, 0x40000
  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
  ; CHECK: s_sub_u32 s32, s32, 0x40000

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
  ret void
}

; NOTE(review): despite the "sgpr_offset" in its name, this test expects both
; subregister spills to use the instruction offset field. The name is kept so
; that the symbol stays unchanged.
; CHECK-LABEL: {{^}}test_sgpr_offset_subregs_kernel:
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
  ; Place the spill slot of %a at scratch offset 4088, so that the spill of
  ; the last subreg of %a still fits below offset 4096 (4088 + 8 - 4 = 4092),
  ; and can be placed in the instruction offset field.
  ; NOTE(review): the alloca itself is only 4084 bytes; the remaining 4 bytes
  ; below the spill slot presumably come from the kernel's frame layout --
  ; confirm.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; NOTE(review): despite the "inst_offset" in its name, this test expects the
; spill to go through an SGPR offset. The name is kept so that the symbol
; stays unchanged.
; CHECK-LABEL: {{^}}test_inst_offset_subregs_kernel:
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
  ; Place the spill slot of %a at scratch offset 4092, so that the spill of
  ; the last subreg of %a does not fit below offset 4096
  ; (4092 + 8 - 4 = 4096), and has to live in the SGPR offset.
  ; NOTE(review): the alloca itself is only 4088 bytes; the remaining 4 bytes
  ; below the spill slot presumably come from the kernel's frame layout --
  ; confirm.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_mov_b32 s6, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; CHECK-LABEL: {{^}}test_inst_offset_function:
define void @test_inst_offset_function() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %a after the clobber so that it has to be reloaded.
  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; CHECK-LABEL: {{^}}test_sgpr_offset_function:
define void @test_sgpr_offset_function() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; CHECK: s_add_u32 s4, s32, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Use %a after the clobber so that it has to be reloaded.
  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; NOTE(review): despite the "sgpr_offset" in its name, this test expects both
; subregister spills to use the instruction offset field. The name is kept so
; that the symbol stays unchanged.
; CHECK-LABEL: {{^}}test_sgpr_offset_subregs_function:
define void @test_sgpr_offset_subregs_function() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; NOTE(review): despite the "inst_offset" in its name, this test expects the
; spill to go through an SGPR offset. The name is kept so that the symbol
; stays unchanged.
; CHECK-LABEL: {{^}}test_inst_offset_subregs_function:
define void @test_inst_offset_subregs_function() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; CHECK: s_add_u32 s4, s32, 0x3ff00
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; #0 is attached to one inline asm call and #2 to the scavenge-failure
; function; #1 is not referenced by anything visible in this file.
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }