1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
2
3; Test that the VGPR spiller correctly switches to SGPR offsets when the
4; instruction offset field would overflow, and that it accounts for memory
5; swizzling.
6
7; CHECK-LABEL: test_inst_offset_kernel
; Kernel case in which the spill slot of %a is still reachable through the
; immediate offset field: the expected store has a literal 0 as soffset and
; offset:4092.
8define amdgpu_kernel void @test_inst_offset_kernel() {
9entry:
10  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
11  ; the instruction offset field.
  ; NOTE(review): the alloca below is 4088 bytes while the expected spill offset
  ; is 4092; presumably 4 bytes of kernel scratch precede it -- confirm.
12  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
13  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
14
15  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
16  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
17  %a = load volatile i32, i32 addrspace(5)* %aptr
18
19  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
20  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
21
  ; Use %a after the clobber so it has to survive across it; the volatile store
  ; also keeps the alloca in use.
22  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
23  store volatile i32 %a, i32 addrspace(5)* %outptr
24
25  ret void
26}
27
28; CHECK-LABEL: test_sgpr_offset_kernel
; Kernel case in which the spill offset of %a no longer fits in the immediate
; offset field: the expected code first moves 0x40000 into s6 and then uses s6
; as the store's soffset, with no immediate offset.
29define amdgpu_kernel void @test_sgpr_offset_kernel() {
30entry:
31  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
32  ; fit in the instruction, and has to live in the SGPR offset.
  ; NOTE(review): the alloca below is 4092 bytes, not 4096; presumably 4 bytes
  ; of kernel scratch precede it -- confirm.
33  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
34  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
35
36  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
37  ; 0x40000 / 64 = 4096 (for wave64)
38  ; CHECK: s_mov_b32 s6, 0x40000
39  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
40  %a = load volatile i32, i32 addrspace(5)* %aptr
41
42  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
43  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
44
  ; Use %a after the clobber so it has to survive across it; the volatile store
  ; also keeps the alloca in use.
45  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
46  store volatile i32 %a, i32 addrspace(5)* %outptr
47
48  ret void
49}
50
51; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
52; pointer to temporarily update, so we just crash.
53
54; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
; Non-kernel function in which the spill offset of %a needs an SGPR, but eight
; SGPR values are kept live across both the spill and the reload. The function
; carries attribute group #2, which sets "amdgpu-num-sgpr"="14". The expected
; code adds 0x40000 to s32 (presumably the stack pointer mentioned in the FIXME
; above), uses s32 as soffset, and subtracts 0x40000 again afterwards.
55define void @test_sgpr_offset_function_scavenge_fail() #2 {
56entry:
57  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
58  ; fit in the instruction, and has to live in the SGPR offset.
59  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
60  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
61
62  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
63
  ; Define eight values through "=s" outputs; they stay live until the first
  ; consuming asm below.
64  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
65  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
66  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
67  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
68  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
69  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
70  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
71  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
72  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
73
74  ; 0x40000 / 64 = 4096 (for wave64)
75  %a = load volatile i32, i32 addrspace(5)* %aptr
76
  ; Expected spill sequence: bump s32, store relative to s32, restore s32.
77  ; CHECK: s_add_u32 s32, s32, 0x40000
78  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
79  ; CHECK: s_sub_u32 s32, s32, 0x40000
  ; Consume the eight "s" values together with %a as a "v" operand.
80  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
81
  ; Define a second set of eight "=s" values that stay live across the VGPR
  ; clobber and the reload of %a.
82  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
83  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
84  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
85  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
86  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
87  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
88  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
89  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
90  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
91
  ; Clobber v0-v7 while %a is still needed by the final asm below.
92  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
93
  ; Expected reload sequence: same s32 bump / restore as for the spill.
94  ; CHECK: s_add_u32 s32, s32, 0x40000
95  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
96  ; CHECK: s_sub_u32 s32, s32, 0x40000
97
98   ; Force %a to spill with no free SGPRs
99  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
100  ret void
101}
102
103; CHECK-LABEL: test_sgpr_offset_subregs_kernel
; Kernel case spilling a two-dword value (<2 x i32>) whose last dword still
; fits in the immediate offset field: the expected stores use a literal 0 as
; soffset with offset:4088 and offset:4092.
; NOTE(review): despite "sgpr_offset" in the name, the patterns in this function
; expect immediate offsets; the name looks swapped with
; test_inst_offset_subregs_kernel -- confirm before renaming.
104define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
105entry:
106  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
107  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
108  ; the instruction offset field.
  ; NOTE(review): the alloca below is 4084 bytes, not 4088; presumably 4 bytes
  ; of kernel scratch precede it -- confirm.
109  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
110  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
111  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
112
113  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
114  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
115  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
116  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
117
118  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
119  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
120
121  ; Ensure the alloca sticks around (volatile load through the i32 view).
122  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
123  %b = load volatile i32, i32 addrspace(5)* %bptr
124
125  ; Ensure the spill is of the full super-reg: both lanes of %a are consumed
  ; by a single asm operand.
126  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
127
128  ret void
129}
130
131; CHECK-LABEL: test_inst_offset_subregs_kernel
; Kernel case spilling a two-dword value (<2 x i32>) whose last dword does not
; fit in the immediate offset field: the expected code moves 0x3ff00 into s6
; and stores the two dwords with soffset s6, the second one at offset:4.
; NOTE(review): despite "inst_offset" in the name, the patterns in this function
; expect an SGPR offset; the name looks swapped with
; test_sgpr_offset_subregs_kernel -- confirm before renaming.
132define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
133entry:
134  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
135  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
136  ; in the SGPR offset.
  ; NOTE(review): the alloca below is 4088 bytes, not 4092; presumably 4 bytes
  ; of kernel scratch precede it -- confirm.
137  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
138  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
139  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
140
141  ; 0x3ff00 / 64 = 4092 (for wave64)
142  ; CHECK: s_mov_b32 s6, 0x3ff00
143  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
144  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill
145  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
146  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
147
148  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
149  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
150
151  ; Ensure the alloca sticks around (volatile load through the i32 view).
152  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
153  %b = load volatile i32, i32 addrspace(5)* %bptr
154
155  ; Ensure the spill is of the full super-reg: both lanes of %a are consumed
  ; by a single asm operand.
156  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
157
158  ret void
159}
160
161; CHECK-LABEL: test_inst_offset_function
; Non-kernel function case in which the spill slot of %a is still reachable
; through the immediate offset field: the expected store uses some SGPR as
; soffset together with offset:4092.
162define void @test_inst_offset_function() {
163entry:
164  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
165  ; the instruction offset field.
166  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
167  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
168
169  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
170  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
171  %a = load volatile i32, i32 addrspace(5)* %aptr
172
173  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
174  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
175
  ; Use %a after the clobber so it has to survive across it; the volatile store
  ; also keeps the alloca in use.
176  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
177  store volatile i32 %a, i32 addrspace(5)* %outptr
178
179  ret void
180}
181
182; CHECK-LABEL: test_sgpr_offset_function
; Non-kernel function case in which the spill offset of %a no longer fits in
; the immediate offset field: the expected code computes s4 = s32 + 0x40000 and
; uses s4 as the store's soffset, with no immediate offset.
183define void @test_sgpr_offset_function() {
184entry:
185  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
186  ; fit in the instruction, and has to live in the SGPR offset.
187  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
188  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
189
190  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
191  ; 0x40000 / 64 = 4096 (for wave64)
192  ; CHECK: s_add_u32 s4, s32, 0x40000
193  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
194  %a = load volatile i32, i32 addrspace(5)* %aptr
195
196  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
197  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
198
  ; Use %a after the clobber so it has to survive across it; the volatile store
  ; also keeps the alloca in use.
199  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
200  store volatile i32 %a, i32 addrspace(5)* %outptr
201
202  ret void
203}
204
205; CHECK-LABEL: test_sgpr_offset_subregs_function
; Non-kernel function case spilling a two-dword value (<2 x i32>) whose last
; dword still fits in the immediate offset field: the expected stores use some
; SGPR as soffset with offset:4088 and offset:4092.
; NOTE(review): despite "sgpr_offset" in the name, the patterns in this function
; expect immediate offsets; the name looks swapped with
; test_inst_offset_subregs_function -- confirm before renaming.
206define void @test_sgpr_offset_subregs_function() {
207entry:
208  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
209  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
210  ; the instruction offset field.
211  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
212  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
213  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
214
215  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
216  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill
217  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
218  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
219
220  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
221  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
222
223  ; Ensure the alloca sticks around (volatile load through the i32 view).
224  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
225  %b = load volatile i32, i32 addrspace(5)* %bptr
226
227  ; Ensure the spill is of the full super-reg: both lanes of %a are consumed
  ; by a single asm operand.
228  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
229
230  ret void
231}
232
233; CHECK-LABEL: test_inst_offset_subregs_function
; Non-kernel function case spilling a two-dword value (<2 x i32>) whose last
; dword does not fit in the immediate offset field: the expected code computes
; s4 = s32 + 0x3ff00 and stores the two dwords with soffset s4, the second one
; at offset:4.
; NOTE(review): despite "inst_offset" in the name, the patterns in this function
; expect an SGPR offset; the name looks swapped with
; test_sgpr_offset_subregs_function -- confirm before renaming.
234define void @test_inst_offset_subregs_function() {
235entry:
236  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
237  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
238  ; in the SGPR offset.
239  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
240  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
241  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
242
243  ; 0x3ff00 / 64 = 4092 (for wave64)
244  ; CHECK: s_add_u32 s4, s32, 0x3ff00
245  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
246  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
247  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
248  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr
249
250  ; Force %a to spill: the empty asm clobbers v0-v7 while %a is still live.
251  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
252
253  ; Ensure the alloca sticks around (volatile load through the i32 view).
254  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
255  %b = load volatile i32, i32 addrspace(5)* %bptr
256
257  ; Ensure the spill is of the full super-reg: both lanes of %a are consumed
  ; by a single asm operand.
258  call void asm sideeffect "; $0", "r"(<2 x i32> %a)
259
260  ret void
261}
262
; Attribute groups. In the visible part of this file, #0 is attached to the
; VGPR-clobbering asm call in test_sgpr_offset_function_scavenge_fail and #2 to
; that function's definition. #1 has no visible user -- NOTE(review): confirm
; whether it is still needed.
263attributes #0 = { nounwind }
264attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
265attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
266