; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s

; If spilling to smem, additional registers are used for the resource
; descriptor.

; ALL-LABEL: {{^}}max_12_sgprs:

; FIXME: Should be able to skip this copying of the private segment
; buffer because all the SGPR spills are to VGPRs.

; ALL: s_mov_b64 s[10:11], s[2:3]
; ALL: s_mov_b64 s[8:9], s[0:1]
; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 14
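; The 14 checked above matches the "amdgpu-num-sgpr"="14" request in attribute
; #0 below; unlike the next test, the requested cap can be honored here.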
define void @max_12_sgprs(i32 addrspace(1)* %out1,
                          i32 addrspace(1)* %out2,
                          i32 addrspace(1)* %out3,
                          i32 addrspace(1)* %out4,
                          i32 %one, i32 %two, i32 %three, i32 %four) #0 {
  store i32 %one, i32 addrspace(1)* %out1
  store i32 %two, i32 addrspace(1)* %out2
  store i32 %three, i32 addrspace(1)* %out3
  store i32 %four, i32 addrspace(1)* %out4
  ret void
}

; private resource: 4
; scratch wave offset: 1
; workgroup ids: 3
; dispatch id: 2
; queue ptr: 2
; flat scratch init: 2
; ---------------------
; total: 14

; + reserved vcc = 16
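; (4 + 1 + 3 + 2 + 2 + 2 = 14 input SGPRs; the 2 SGPRs reserved for vcc bring
; the total to 16. SGPRBlocks appears to be the SGPR count rounded up to a
; granule of 8, divided by 8, minus 1, so both 14 and 16 report SGPRBlocks: 1.)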

; Because we can't handle re-using the last few input registers as the special
; vcc etc. registers (nor decide to drop the unused input features once the
; register count has been frozen), this ends up using more SGPRs than the 12
; requested by attribute #2.

; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
; TOSGPR: SGPRBlocks: 1
; TOSGPR: NumSGPRsForWavesPerEU: 16

; TOSMEM: s_mov_b64 s[10:11], s[2:3]
; TOSMEM: s_mov_b64 s[8:9], s[0:1]
; TOSMEM: s_mov_b32 s7, s13

; TOSMEM: SGPRBlocks: 1
; TOSMEM: NumSGPRsForWavesPerEU: 16
define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                        i32 addrspace(1)* %out2,
                                        i32 addrspace(1)* %out3,
                                        i32 addrspace(1)* %out4,
                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
  store volatile i32 0, i32* undef
  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
  store volatile i32 %x.0, i32 addrspace(1)* undef
  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
  store volatile i32 %x.1, i32 addrspace(1)* undef
  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
  store volatile i32 %x.2, i32 addrspace(1)* undef
  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
  store volatile i64 %x.3, i64 addrspace(1)* undef
  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef

  store i32 %one, i32 addrspace(1)* %out1
  store i32 %two, i32 addrspace(1)* %out2
  store i32 %three, i32 addrspace(1)* %out3
  store i32 %four, i32 addrspace(1)* %out4
  ret void
}

; The following test is commented out for now; http://llvm.org/PR31230
; XALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
; ; Make sure copies for input buffer are not clobbered. This requires
; ; swapping the order the registers are copied from what normally
; ; happens.

; XTOSMEM: s_mov_b32 s5, s11
; XTOSMEM: s_add_u32 m0, s5,
; XTOSMEM: s_buffer_store_dword vcc_lo, s[0:3], m0

; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
;                                        i32 addrspace(1)* %out2,
;                                        i32 addrspace(1)* %out3,
;                                        i32 addrspace(1)* %out4,
;                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
;  store volatile i32 0, i32* undef
;  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
;  store volatile i32 %x.0, i32 addrspace(1)* undef
;  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
;  store volatile i32 %x.1, i32 addrspace(1)* undef
;  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
;  store volatile i32 %x.2, i32 addrspace(1)* undef
;  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
;  store volatile i64 %x.3, i64 addrspace(1)* undef
;  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
;  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
;
;  store i32 %one, i32 addrspace(1)* %out1
;  store i32 %two, i32 addrspace(1)* %out2
;  store i32 %three, i32 addrspace(1)* %out3
;  store i32 %four, i32 addrspace(1)* %out4
;  ret void
;}

declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
declare i32 @llvm.amdgcn.workgroup.id.z() #1
declare i64 @llvm.amdgcn.dispatch.id() #1
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1

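; Note: "amdgpu-num-sgpr" requests a maximum SGPR count for a function. As the
; second test above shows, the backend may exceed the requested value when the
; required input SGPRs plus reserved registers (vcc) do not fit under it.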
attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
