1; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
3
4; GCN-LABEL: {{^}}max_11_vgprs:
5; GFX900-NOT: SCRATCH_RSRC
6; GFX908-NOT: SCRATCH_RSRC
7; GFX908-DAG: v_accvgpr_write_b32 [[A_REG:a[0-9]+]], v{{[0-9]}}
8; GFX900-NOT: buffer_
9; GFX908-NOT: buffer_
10; GFX908-DAG: v_mov_b32_e32 v{{[0-9]}}, [[V_REG:v[0-9]+]]
11; GFX908-DAG: v_accvgpr_read_b32 [[V_REG]], [[A_REG]]
12
13; GFX900: NumVgprs: 11
14; GFX908: NumVgprs: 10
15; GFX900: ScratchSize: 0
16; GFX908: ScratchSize: 0
17; GCN:    VGPRBlocks: 2
18; GFX900: NumVGPRsForWavesPerEU: 11
19; GFX908: NumVGPRsForWavesPerEU: 10
; Kernel carrying attribute group #2 ("amdgpu-num-vgpr"="11"). It loads ten
; i32 values, hands all ten to a single empty inline asm statement as "v"
; operands, and then stores each of them with a volatile store.
20define amdgpu_kernel void @max_11_vgprs(i32 addrspace(1)* %p) #2 {
; The base index comes from a volatile load through an undef pointer.
21  %tid = load volatile i32, i32 addrspace(1)* undef
; Chain of ten addresses: each GEP is based on the previous one, with the
; constant offset growing by 4 elements per step (4, 8, ..., 36).
22  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
23  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
24  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
25  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
26  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
27  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
28  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
29  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
30  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
31  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
; Ten volatile loads, one per address computed above.
32  %v1 = load volatile i32, i32 addrspace(1)* %p1
33  %v2 = load volatile i32, i32 addrspace(1)* %p2
34  %v3 = load volatile i32, i32 addrspace(1)* %p3
35  %v4 = load volatile i32, i32 addrspace(1)* %p4
36  %v5 = load volatile i32, i32 addrspace(1)* %p5
37  %v6 = load volatile i32, i32 addrspace(1)* %p6
38  %v7 = load volatile i32, i32 addrspace(1)* %p7
39  %v8 = load volatile i32, i32 addrspace(1)* %p8
40  %v9 = load volatile i32, i32 addrspace(1)* %p9
41  %v10 = load volatile i32, i32 addrspace(1)* %p10
; Empty side-effecting asm that takes %v1..%v10 as ten "v" operands of one
; statement, so all ten values are used at the same program point.
42  call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
; Every loaded value is used again after the asm by a volatile store to undef.
43  store volatile i32 %v1, i32 addrspace(1)* undef
44  store volatile i32 %v2, i32 addrspace(1)* undef
45  store volatile i32 %v3, i32 addrspace(1)* undef
46  store volatile i32 %v4, i32 addrspace(1)* undef
47  store volatile i32 %v5, i32 addrspace(1)* undef
48  store volatile i32 %v6, i32 addrspace(1)* undef
49  store volatile i32 %v7, i32 addrspace(1)* undef
50  store volatile i32 %v8, i32 addrspace(1)* undef
51  store volatile i32 %v9, i32 addrspace(1)* undef
52  store volatile i32 %v10, i32 addrspace(1)* undef
53  ret void
54}
55
56; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
57; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
58; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
59; GCN:        buffer_store_dword v{{[0-9]}},
60; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
61; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
62; GCN-NOT:    a10
63
64; GFX908: NumVgprs: 10
65; GFX900: ScratchSize: 100
66; GFX908: ScratchSize: 68
67; GFX908: VGPRBlocks: 2
68; GFX908: NumVGPRsForWavesPerEU: 10
; Kernel carrying attribute group #0 ("amdgpu-num-vgpr"="10"). It moves one
; <32 x float> value (32 dwords, i.e. more than the 10 requested VGPRs)
; from memory to memory with a volatile load followed by a volatile store.
69define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
; The work-item id selects which <32 x float> element of %p is read.
70  %tid = call i32 @llvm.amdgcn.workitem.id.x()
71  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
72  %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
; The whole vector is written back out through an undef pointer.
73  store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
74  ret void
75}
76
77; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32:
78; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
79; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
80; GFX908-NOT: SCRATCH_RSRC
81; GFX908-DAG: v_accvgpr_write_b32 a0, v
82; GFX900:     buffer_store_dword v
83; GFX900:     buffer_load_dword v
84; GFX908-NOT: buffer_
85; GFX908-DAG: v_accvgpr_read_b32
86
87; GFX900: NumVgprs: 256
88; GFX900: ScratchSize: 148
89; GFX908: NumVgprs: 254
90; GFX908: ScratchSize: 0
91; GCN:    VGPRBlocks: 63
92; GFX900:    NumVGPRsForWavesPerEU: 256
93; GFX908:    NumVGPRsForWavesPerEU: 254
; Kernel carrying attribute group #1 ("amdgpu-flat-work-group-size"="1,256").
; It performs nine volatile <32 x float> loads (9 x 32 = 288 dwords in total)
; and only afterwards stores the nine vectors, all in one basic block.
94define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) #1 {
95  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Nine chained addresses: each GEP advances the previous pointer by %tid.
96  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
97  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
98  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
99  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
100  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
101  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
102  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
103  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
104  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
; All nine vectors are loaded before any of them is stored.
105  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
106  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
107  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
108  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
109  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
110  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
111  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
112  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
113  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
; Volatile stores of the nine vectors, in load order, to an undef pointer.
114  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
115  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
116  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
117  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
118  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
119  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
120  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
121  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
122  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
123  ret void
124}
125
126; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
127; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
128; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
129; GFX908-NOT: SCRATCH_RSRC
130; GFX908: v_accvgpr_write_b32
131; GFX908:  global_load_
132; GFX900:     buffer_store_dword v
133; GFX900:     buffer_load_dword v
134; GFX908-NOT: buffer_
135; GFX908-DAG: v_accvgpr_read_b32
136
137; GFX900: NumVgprs: 256
138; GFX908: NumVgprs: 254
139; GFX900: ScratchSize: 1796
140; GFX908: ScratchSize: 0
141; GFX900: VGPRBlocks: 63
142; GFX908: VGPRBlocks: 63
143; GFX900: NumVGPRsForWavesPerEU: 256
144; GFX908: NumVGPRsForWavesPerEU: 254
; Two-basic-block variant of the nine-vector test: the nine volatile
; <32 x float> loads happen in the entry block and the nine volatile stores in
; block %st, so every loaded value is live across the branch between them.
; Carries attribute group #1 ("amdgpu-flat-work-group-size"="1,256").
145define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) #1 {
146  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Nine chained addresses: each GEP advances the previous pointer by %tid.
147  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
148  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
149  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
150  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
151  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
152  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
153  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
154  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
155  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
; Entry block: load all nine vectors.
156  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
157  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
158  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
159  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
160  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
161  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
162  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
163  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
164  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
; Unconditional branch that separates the loads from the stores.
165  br label %st
166
; Store block: write the nine vectors, in load order, to an undef pointer.
167st:
168  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
169  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
170  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
171  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
172  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
173  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
174  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
175  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
176  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
177  ret void
178}
179
180; Make sure there's no crash when we have loads from fixed stack
181; objects and are processing VGPR spills
182
183; GCN-LABEL: {{^}}stack_args_vgpr_spill:
184; GFX908: v_accvgpr_write_b32
185; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32
186; GFX908: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
; Non-kernel function (no amdgpu_kernel calling convention) that receives two
; <32 x float> arguments by value plus a pointer. It loads seven more
; <32 x float> vectors in the entry block and stores both arguments and all
; seven loaded vectors in block %st.
; NOTE(review): the check lines for this function expect s32-relative buffer
; loads, which suggests part of the by-value arguments arrives on the stack;
; confirm against the calling convention if that matters.
187define void @stack_args_vgpr_spill(<32 x float> %arg0, <32 x float> %arg1, <32 x float> addrspace(1)* %p) #1 {
188  %tid = call i32 @llvm.amdgcn.workitem.id.x()
; Seven chained addresses: each GEP advances the previous pointer by %tid.
189  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
190  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
191  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
192  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
193  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
194  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
195  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
; Entry block: load all seven vectors.
196  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
197  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
198  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
199  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
200  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
201  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
202  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
; Unconditional branch that separates the loads from the stores.
203  br label %st
204
; Store block: the two incoming arguments first, then the seven loaded vectors.
205st:
206  store volatile <32 x float> %arg0, <32 x float> addrspace(1)* undef
207  store volatile <32 x float> %arg1, <32 x float> addrspace(1)* undef
208  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
209  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
210  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
211  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
212  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
213  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
214  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
215  ret void
216}
217
218
; Intrinsic whose i32 result is used as the getelementptr index in the tests
; above.
219declare i32 @llvm.amdgcn.workitem.id.x()
220
; #0: requests 10 VGPRs; used by @max_10_vgprs_spill_v32.
221attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }
; #1: flat work-group size range 1..256; used by the two
; @max_256_vgprs_spill_9x32* kernels and by @stack_args_vgpr_spill.
222attributes #1 = { "amdgpu-flat-work-group-size"="1,256" }
; #2: requests 11 VGPRs; used by @max_11_vgprs.
223attributes #2 = { nounwind "amdgpu-num-vgpr"="11" }
224