; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s

; Test that the VGPR spiller correctly switches to SGPR offsets when the
; instruction offset field would overflow, and that it accounts for memory
; swizzling.

; GCN-LABEL: test_inst_offset_kernel
define amdgpu_kernel void @test_inst_offset_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; GCN-LABEL: test_sgpr_offset_kernel
define amdgpu_kernel void @test_sgpr_offset_kernel() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF:   s_mov_b32 s4, 0x40000
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 s2, 0x1000
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s2 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack
; pointer to temporarily update, so we just crash.

; GCN-LABEL: test_sgpr_offset_function_scavenge_fail_func
define void @test_sgpr_offset_function_scavenge_fail_func() #2 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Tie up all 8 available SGPRs (see "amdgpu-num-sgpr"="14" on #2) so the
  ; spiller cannot scavenge one for the offset.
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; MUBUF:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
  ; MUBUF-NEXT: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Spill

  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; MUBUF:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
  ; MUBUF-NEXT: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], s32 offen ; 4-byte Folded Reload
  ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s32, 0x1004
  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
  ret void
}

; GCN-LABEL: test_sgpr_offset_function_scavenge_fail_kernel
define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1

  ; Tie up all available SGPRs (see "amdgpu-num-sgpr"="16" on #3) so the
  ; spiller cannot scavenge one for the offset.
  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7

  ; 0x40000 / 64 = 4096 (for wave64)
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Spill

  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, [[SOFF]] ; 4-byte Folded Spill
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)

  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7

  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0

  ; MUBUF: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1004
  ; MUBUF: buffer_load_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+:[0-9]+}}], 0 offen ; 4-byte Folded Reload
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0x1004
  ; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF]] ; 4-byte Folded Reload

  ; Force %a to spill with no free SGPRs
  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
  ret void
}

; GCN-LABEL: test_sgpr_offset_subregs_kernel
define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xff8
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]]          ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; GCN-LABEL: test_inst_offset_subregs_kernel
define amdgpu_kernel void @test_inst_offset_subregs_kernel() {
entry:
  ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF:   s_mov_b32 s4, 0x3ff00
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: s_movk_i32 [[SOFF:s[0-9]+]], 0xffc
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], [[SOFF]]          ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; GCN-LABEL: test_inst_offset_function
define void @test_inst_offset_function() {
entry:
  ; Occupy enough bytes of scratch, so the offset of the spill of %a
  ; just fits in the instruction offset field when the emergency stack
  ; slot is added. It's hard to hit the actual limit since we're also
  ; going to insert the emergency stack slot for large frames.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; GCN-LABEL: test_sgpr_offset_function
define void @test_sgpr_offset_function() {
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; MUBUF:   s_add_i32 s4, s32, 0x40100
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; FLATSCR: s_add_i32 s0, s32, 0x1004
  ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s0 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; GCN-LABEL: test_sgpr_offset_subregs_function
define void @test_sgpr_offset_subregs_function() {
entry:
  ; We want to test the spill of the last subreg of %a is the highest
  ; valid value for the immediate offset. We enable the emergency
  ; stack slot for large frames, so it's hard to get the frame layout
  ; exactly as we want to test it.
  ;
  ; Occupy 4084 bytes of scratch, so that the spill of the last subreg of %a
  ; still fits below offset 4096 (4084 + 8 - 4 = 4092), and can be placed in
  ; the instruction offset field.
  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4084 ; 4-byte Folded Spill
  ; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4084 ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}

; GCN-LABEL: test_inst_offset_subregs_function
define void @test_inst_offset_subregs_function() {
entry:
  ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
  ; does not fit below offset 4096 (4088 + 4 + 8 - 4 = 4096), and has to live
  ; in the SGPR offset.
  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
  %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
  %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*

  ; 0x3ff00 / 64 = 4092 (for wave64)
  ; MUBUF: s_add_i32 s4, s32, 0x3ff00
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 ; 4-byte Folded Spill
  ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s4 offset:4 ; 4-byte Folded Spill
  ; FLATSCR: scratch_store_dwordx2 off, v[{{[0-9:]+}}], s32 offset:4092 ; 8-byte Folded Spill
  %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1
  %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr

  ; Force %a to spill.
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  ; Ensure the alloca sticks around.
  %bptr = getelementptr i32, i32 addrspace(5)* %bufv1, i32 1
  %b = load volatile i32, i32 addrspace(5)* %bptr

  ; Ensure the spill is of the full super-reg.
  call void asm sideeffect "; $0", "r"(<2 x i32> %a)

  ret void
}
336
337attributes #0 = { nounwind }
338attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" }
339attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" }
340attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }
341