1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s 2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s 3 4; Test that the VGPR spiller correctly switches to SGPR offsets when the 5; instruction offset field would overflow, and that it accounts for memory 6; swizzling. 7 8; GCN-LABEL: test_inst_offset_kernel 9define amdgpu_kernel void @test_inst_offset_kernel() { 10entry: 11 ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in 12 ; the instruction offset field. 13 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 14 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 15 16 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 17 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill 18 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill 19 %a = load volatile i32, i32 addrspace(5)* %aptr 20 21 ; Force %a to spill. 22 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 23 24 %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 25 store volatile i32 %a, i32 addrspace(5)* %outptr 26 27 ret void 28} 29 30; GCN-LABEL: test_sgpr_offset_kernel 31define amdgpu_kernel void @test_sgpr_offset_kernel() { 32entry: 33 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 34 ; fit in the instruction, and has to live in the SGPR offset. 35 %alloca = alloca i8, i32 4092, align 4, addrspace(5) 36 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 37 38 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 39 ; 0x40000 / 64 = 4096 (for wave64) 40 ; MUBUF: s_mov_b32 s4, 0x40000 41 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill 42 ; FLATSCR: s_movk_i32 s2, 0x1000 43 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill 44 %a = load volatile i32, i32 addrspace(5)* %aptr 45 46 ; Force %a to spill 47 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 48 49 %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 50 store volatile i32 %a, i32 addrspace(5)* %outptr 51 52 ret void 53} 54 55; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack 56; pointer to temporarily update, so we just crash. 57 58; GCN-LABEL: test_sgpr_offset_function_scavenge_fail_func 59define void @test_sgpr_offset_function_scavenge_fail_func() #2 { 60entry: 61 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 62 ; fit in the instruction, and has to live in the SGPR offset. 63 %alloca = alloca i8, i32 4096, align 4, addrspace(5) 64 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 65 66 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 67 68 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 69 %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 70 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 71 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 72 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 73 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 74 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 75 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 76 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 77 78 ; 0x40000 / 64 = 4096 (for wave64) 79 %a = load volatile i32, i32 addrspace(5)* %aptr 80 81 ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 82 ; MUBUF-NEXT: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Spill 83 84; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004 85 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill 86 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) 87 88 %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 89 %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 90 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 91 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 92 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 93 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 94 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 95 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 96 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 97 98 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 99 100 ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 101 ; MUBUF-NEXT: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Reload 102 ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004 103 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload 104 105 ; Force %a to spill with no free SGPRs 106 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) 107 ret void 108} 109 110define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { 111entry: 112 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 113 ; fit in the instruction, and has to live in the SGPR offset. 114 %alloca = alloca i8, i32 4096, align 4, addrspace(5) 115 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 116 117 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 118 119 %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 120 %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0 121 %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1 122 %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2 123 %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3 124 %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4 125 %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5 126 %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6 127 %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7 128 129 ; 0x40000 / 64 = 4096 (for wave64) 130 %a = load volatile i32, i32 addrspace(5)* %aptr 131 132 ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 133 ; MUBUF: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Spill 134 135 ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004 136 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill 137 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a) 138 139 %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() 140 %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 141 %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 142 %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 143 %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 144 %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 145 %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 146 %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 147 %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 148 149 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 150 151 ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004 152 ; MUBUF: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Reload 153 ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004 154 ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload 155 156 ; Force %a to spill with no free SGPRs 157 call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) 158 ret void 159} 160 161; GCN-LABEL: test_sgpr_offset_subregs_kernel 162define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() { 163entry: 164 ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a 165 ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in 166 ; the instruction offset field. 167 %alloca = alloca i8, i32 4084, align 4, addrspace(5) 168 %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 169 %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* 170 171 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill 172 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill 173 ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8 174 ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill 175 %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 176 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr 177 178 ; Force %a to spill. 179 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 180 181 ; Ensure the alloca sticks around. 182 %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 183 %b = load volatile i32, i32 addrspace(5)* %bptr 184 185 ; Ensure the spill is of the full super-reg. 186 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 187 188 ret void 189} 190 191; GCN-LABEL: test_inst_offset_subregs_kernel 192define amdgpu_kernel void @test_inst_offset_subregs_kernel() { 193entry: 194 ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a 195 ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live 196 ; in the SGPR offset. 197 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 198 %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 199 %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* 200 201 ; 0x3ff00 / 64 = 4092 (for wave64) 202 ; MUBUF: s_mov_b32 s4, 0x3ff00 203 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill 204 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill 205 ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc 206 ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]] ; 8-byte Folded Spill 207 %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 208 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr 209 210 ; Force %a to spill. 211 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 212 213 ; Ensure the alloca sticks around. 214 %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 215 %b = load volatile i32, i32 addrspace(5)* %bptr 216 217 ; Ensure the spill is of the full super-reg. 218 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 219 220 ret void 221} 222 223; GCN-LABEL: test_inst_offset_function 224define void @test_inst_offset_function() { 225entry: 226 ; Occupy enough bytes of scratch, so the offset of the spill of %a 227 ; just fits in the instruction offset field when the emergency stack 228 ; slot is added. It's hard to hit the actual limit since we're also 229 ; going to insert the emergency stack slot for large frames. 230 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 231 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 232 233 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 234 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill 235 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill 236 %a = load volatile i32, i32 addrspace(5)* %aptr 237 238 ; Force %a to spill. 239 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 240 241 %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 242 store volatile i32 %a, i32 addrspace(5)* %outptr 243 244 ret void 245} 246 247; GCN-LABEL: test_sgpr_offset_function 248define void @test_sgpr_offset_function() { 249entry: 250 ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not 251 ; fit in the instruction, and has to live in the SGPR offset. 252 %alloca = alloca i8, i32 4096, align 4, addrspace(5) 253 %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 254 255 %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 256 ; 0x40000 / 64 = 4096 (for wave64) 257 ; MUBUF: s_add_i32 s4, s32, 0x40100 258 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill 259 ; FLATSCR: s_add_i32 s0, s32, 0x1004 260 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill 261 %a = load volatile i32, i32 addrspace(5)* %aptr 262 263 ; Force %a to spill 264 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 265 266 %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 267 store volatile i32 %a, i32 addrspace(5)* %outptr 268 269 ret void 270} 271 272; GCN-LABEL: test_sgpr_offset_subregs_function 273define void @test_sgpr_offset_subregs_function() { 274entry: 275 ; We want to test the spill of the last subreg of %a is the highest 276 ; valid value for the immediate offset. We enable the emergency 277 ; stack slot for large frames, so it's hard to get the frame layout 278 ; exactly as we want to test it. 279 ; 280 ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a 281 ; still fits below offset 4096 (4084 + 8 - 4 = 4092), and can be placed in 282 ; the instruction offset field. 283 %alloca = alloca i8, i32 4084, align 4, addrspace(5) 284 %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 285 %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* 286 287 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4084 ; 4-byte Folded Spill 288 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill 289 ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4084 ; 8-byte Folded Spill 290 %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 291 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr 292 293 ; Force %a to spill. 294 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 295 296 ; Ensure the alloca sticks around. 297 %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 298 %b = load volatile i32, i32 addrspace(5)* %bptr 299 300 ; Ensure the spill is of the full super-reg. 301 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 302 303 ret void 304} 305 306; GCN-LABEL: test_inst_offset_subregs_function 307define void @test_inst_offset_subregs_function() { 308entry: 309 ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a 310 ; does not fit below offset 4096 (408 + 4 + 8 - 4 = 4096), and has to live 311 ; in the SGPR offset. 312 %alloca = alloca i8, i32 4088, align 4, addrspace(5) 313 %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* 314 %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* 315 316 ; 0x3ff0000 / 64 = 4092 (for wave64) 317 ; MUBUF: s_add_i32 s4, s32, 0x3ff00 318 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill 319 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill 320 ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4092 ; 8-byte Folded Spill 321 %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 322 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr 323 324 ; Force %a to spill. 325 call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () 326 327 ; Ensure the alloca sticks around. 328 %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1 329 %b = load volatile i32, i32 addrspace(5)* %bptr 330 331 ; Ensure the spill is of the full super-reg. 332 call void asm sideeffect "; $0", "r"(<2 x i32> %a) 333 334 ret void 335} 336 337attributes #0 = { nounwind } 338attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } 339attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } 340attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } 341