; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; Kernel with no explicit arguments and no "amdgpu-implicitarg-num-bytes"
; attribute (group #0). It does a volatile i32 load through the pointer
; returned by llvm.amdgcn.implicitarg.ptr; the load is volatile so it is kept
; even though %load is unused. The amdhsa run expects a 56-byte kernarg
; segment and a load straight from s[4:5] at offset 0; the mesa3d run expects
; a 16-byte segment.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Variant of the no-argument kernel that uses attribute group #3
; ("amdgpu-implicitarg-num-bytes"="0"). For amdhsa the checks expect the
; kernarg segment pointer SGPRs to be disabled, a zero-sized kernarg segment,
; and the load to go through an SGPR pair that was set to 0. For mesa3d the
; checks still expect the pointer to be enabled, a 16-byte segment, and a
; load from s[4:5].
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty_0implicit:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s0, [[NULL]], 0x0

; MESA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; No-argument kernel using attribute group #1
; ("amdgpu-implicitarg-num-bytes"="48"). The amdhsa run expects the kernarg
; segment to be exactly those 48 bytes; the mesa3d run expects 16 bytes.
; Both runs expect the kernarg segment pointer to be enabled.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel with one unnamed 112-byte explicit argument ([112 x i8]) and
; attribute group #0. Expected segment sizes are 168 for amdhsa and 128 for
; mesa3d, i.e. 112 plus the 56 / 16 bytes that the no-argument kernel
; reports. The implicitarg load is expected at offset 0x1c from s[4:5];
; presumably this is a dword offset (0x1c * 4 = 112), placing it directly
; after the explicit argument.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel with a 112-byte explicit argument and attribute group #1 (48
; implicit bytes). The amdhsa run expects 160 bytes (112 + 48) and the
; mesa3d run expects 128. The implicitarg load is expected at the same 0x1c
; offset from s[4:5] as in kernel_implicitarg_ptr.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function (plain `define void`) that loads through the
; implicitarg pointer. Both runs expect the load to read from s[8:9] at
; offset 0, followed immediately by a wait and the s_setpc_b64 return.
; This function is also the callee of the *_call_implicitarg_ptr_func tests.
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function with the same body and the same expectations as
; func_implicitarg_ptr: load from s[8:9] at offset 0, then wait and return.
; NOTE(review): despite the opencl_ name this uses attribute group #0, not
; #1, so it differs from func_implicitarg_ptr only by name - confirm that
; this is intended.
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; No-argument kernel (group #0) that does not use the intrinsic itself but
; calls func_implicitarg_ptr. The checks expect the same segment sizes as
; kernel_implicitarg_ptr_empty (56 for amdhsa, 16 for mesa3d) and expect
; s[4:5] to be copied into s[8:9] before the s_swappc_b64 call.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 56
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; GCN: s_mov_b64 s[8:9], s[4:5]
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; No-argument kernel with zero implicit bytes (group #3) that calls
; func_implicitarg_ptr. For amdhsa the checks expect the kernarg segment
; pointer to be disabled, a zero-sized segment, and s[8:9] to be set to 0
; before the call. For mesa3d they expect the pointer enabled, a 16-byte
; segment, and s[4:5] copied into s[8:9].
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty_implicit0:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 s[8:9], 0{{$}}
; MESA: s_mov_b64 s[8:9], s[4:5]{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 {
  call void @func_implicitarg_ptr()
  ret void
}

; No-argument kernel with 48 implicit bytes (group #1) that calls
; func_implicitarg_ptr. The checks expect s[4:5] to be copied into s[8:9],
; and expect no further text matching s4 or s5 between that copy and the
; s_swappc_b64 call.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 16
; GCN: s_mov_b64 s[8:9], s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with a 112-byte explicit argument (group #0) that calls
; func_implicitarg_ptr. The checks expect s[8:9] to be formed as s[4:5] plus
; 0x70 (112, the size of the explicit argument) with an add / add-with-carry
; pair before the s_swappc_b64 call.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 168
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_add_u32 s8, s4, 0x70
; MESA: s_add_u32 s8, s4, 0x70

; GCN: s_addc_u32 s9, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with a 112-byte explicit argument and 48 implicit bytes (group #1)
; that calls func_implicitarg_ptr. Expected segment sizes are 160 for amdhsa
; and 128 for mesa3d. s[8:9] is expected to be s[4:5] plus 0x70 (112) before
; the call.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; GCN: s_add_u32 s8, s4, 0x70
; GCN: s_addc_u32 s9, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function that calls func_implicitarg_ptr. The checks require
; that nothing matching s8, s9 or s[8:9] appears before the s_swappc_b64
; call, so the incoming pair is presumably forwarded to the callee
; untouched. The function is then expected to return through s[30:31].
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s[8:9]
; GCN: s_swappc_b64
; GCN: s_setpc_b64 s[30:31]
define void @func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function with the same body and the same expectations as
; func_call_implicitarg_ptr_func: no mention of s8, s9 or s[8:9] before the
; call, then a return through s[30:31].
; NOTE(review): despite the opencl_ name this uses attribute group #0 -
; confirm that this is intended.
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; GCN-NOT: s8
; GCN-NOT: s9
; GCN-NOT: s[8:9]
; GCN: s_swappc_b64
; GCN: s_setpc_b64 s[30:31]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function that loads through both llvm.amdgcn.kernarg.segment.ptr
; and llvm.amdgcn.implicitarg.ptr. The checks expect the kernarg segment
; pointer load to go through an SGPR pair that was set to 0, and the
; implicitarg load to read from s[8:9] at offset 0.
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Non-kernel function with the same body and the same expectations as
; func_kernarg_implicitarg_ptr: the kernarg segment pointer load goes through
; a zeroed SGPR pair and the implicitarg load reads from s[8:9].
; NOTE(review): despite the opencl_ name this uses attribute group #0 -
; confirm that this is intended.
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Kernel with a 112-byte explicit argument (group #0) that calls
; func_kernarg_implicitarg_ptr. The checks expect s[8:9] to be s[4:5] plus
; 0x70 (112) before the s_swappc_b64 call.
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; GCN: s_add_u32 s8, s4, 0x70
; GCN: s_addc_u32 s9, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_kernarg_implicitarg_ptr()
  ret void
}

; Kernel taking a <16 x i32> (64 bytes) followed by an i32 (4 bytes), with
; 48 implicit bytes (group #1). Expected segment sizes are 120 for amdhsa and
; 84 for mesa3d (68 explicit bytes + 16). The amdhsa figure is presumably 68
; rounded up to 72 plus 48, i.e. the explicit arguments are not padded out to
; the 64-byte vector alignment before the implicit arguments - confirm
; against the backend. Both runs expect kernarg_segment_alignment = 6; the
; metadata check at the end of the file expects KernargSegmentAlign: 64 for
; this kernel.
; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
; HSA: kernarg_segment_byte_size = 120
; MESA: kernarg_segment_byte_size = 84
; GCN: kernarg_segment_alignment = 6
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}


; HSA-LABEL: Kernels:
; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty
; HSA: CodeProps:
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty_0implicit
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 56
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty_implicit0
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            opencl_kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            opencl_kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_kernarg_implicitarg_ptr_func
; HSA: KernargSegmentSize: 168
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_implicitarg_no_struct_align_padding
; HSA: KernargSegmentSize: 120
; HSA: KernargSegmentAlign: 64

; Intrinsics exercised by this test; both take no arguments and return an
; i8 pointer in address space 4.
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

; Attribute groups used by the definitions in this file:
;   #0 - baseline, no "amdgpu-implicitarg-num-bytes" attribute.
;   #1 - "amdgpu-implicitarg-num-bytes"="48".
;   #2 - attributes of the intrinsic declarations.
;   #3 - "amdgpu-implicitarg-num-bytes"="0".
attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
