1; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s
3
; Kernel with no explicit arguments, attribute group #0. It reads one dword
; through the pointer returned by llvm.amdgcn.implicitarg.ptr.
; The HSA run expects the kernarg segment pointer to be enabled, a 56-byte
; kernarg segment, and the load to come from s[4:5] at offset 0. The Mesa run
; expects a 16-byte segment.
; NOTE(review): 56 and 16 are presumably each target's default implicit
; argument size, since #0 sets no explicit size -- confirm.
4; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
5; HSA: enable_sgpr_kernarg_segment_ptr = 1
6; HSA: kernarg_segment_byte_size = 56
7; HSA: kernarg_segment_alignment = 4
8
9; MESA: enable_sgpr_kernarg_segment_ptr = 1
10; MESA: kernarg_segment_byte_size = 16
11; MESA: kernarg_segment_alignment = 4
12
13; HSA: s_load_dword s0, s[4:5], 0x0
14define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
15  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
16  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
17  %load = load volatile i32, i32 addrspace(4)* %cast
18  ret void
19}
20
; Same body as kernel_implicitarg_ptr_empty, but with attribute group #3,
; which sets "amdgpu-implicitarg-num-bytes" to "0".
; The HSA run expects the kernarg segment pointer to be disabled, a 0-byte
; kernarg segment, and the load to go through a register pair that was set to
; 0. The Mesa run still expects a 16-byte segment and a load from s[4:5].
21; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty_0implicit:
22; HSA: enable_sgpr_kernarg_segment_ptr = 0
23; HSA: kernarg_segment_byte_size = 0
24; HSA: kernarg_segment_alignment = 4
25
26; MESA: enable_sgpr_kernarg_segment_ptr = 1
27; MESA: kernarg_segment_byte_size = 16
28; MESA: kernarg_segment_alignment = 4
29
30; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
31; HSA: s_load_dword s0, [[NULL]], 0x0
32
33; MESA: s_load_dword s0, s[4:5], 0x0
34define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
35  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
36  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
37  %load = load volatile i32, i32 addrspace(4)* %cast
38  ret void
39}
40
; Kernel with no explicit arguments and attribute group #1, which sets
; "amdgpu-implicitarg-num-bytes" to "48". It reads one dword through the
; implicit argument pointer.
; Both runs expect the kernarg segment pointer to be enabled. The HSA run
; expects a 48-byte segment and a load from s[4:5] at offset 0; the Mesa run
; expects a 16-byte segment.
41; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
42; GCN: enable_sgpr_kernarg_segment_ptr = 1
43
44; HSA: kernarg_segment_byte_size = 48
45; HSA: kernarg_segment_alignment = 4
46
47; MESA: kernarg_segment_byte_size = 16
48; MESA: kernarg_segment_alignment = 4
49
50; HSA: s_load_dword s0, s[4:5], 0x0
51define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
52  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
53  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
54  %load = load volatile i32, i32 addrspace(4)* %cast
55  ret void
56}
57
; Kernel with one unnamed 112-byte explicit argument ([112 x i8]) and attribute
; group #0. It reads one dword through the implicit argument pointer.
; Expected segment sizes are 168 on the HSA run (112 + 56) and 128 on the Mesa
; run (112 + 16).
; NOTE(review): the expected load offset 0x1c is presumably the 112-byte
; explicit argument size in dword units (28 * 4 = 112) -- confirm against the
; scalar load offset encoding for this CPU.
58; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
59; GCN: enable_sgpr_kernarg_segment_ptr = 1
60
61; HSA: kernarg_segment_byte_size = 168
62; HSA: kernarg_segment_alignment = 4
63
64; MESA: kernarg_segment_byte_size = 128
65; MESA: kernarg_segment_alignment = 4
66
67; HSA: s_load_dword s0, s[4:5], 0x1c
68define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
69  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
70  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
71  %load = load volatile i32, i32 addrspace(4)* %cast
72  ret void
73}
74
; Kernel with one unnamed 112-byte explicit argument ([112 x i8]) and attribute
; group #1 (48 implicit argument bytes). It reads one dword through the
; implicit argument pointer.
; Expected segment sizes are 160 on the HSA run (112 + 48) and 128 on the Mesa
; run. The HSA run expects the same 0x1c load offset as kernel_implicitarg_ptr.
75; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
76; GCN: enable_sgpr_kernarg_segment_ptr = 1
77
78; HSA: kernarg_segment_byte_size = 160
79; HSA: kernarg_segment_alignment = 4
80
81; MESA: kernarg_segment_byte_size = 128
82; MESA: kernarg_segment_alignment = 4
83
84; HSA: s_load_dword s0, s[4:5], 0x1c
85define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
86  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
87  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
88  %load = load volatile i32, i32 addrspace(4)* %cast
89  ret void
90}
91
; Non-kernel function that reads one dword through the implicit argument
; pointer. Both runs expect the load to use s[8:9] at offset 0, followed
; directly by a wait and the return (s_setpc_b64).
; This function is the callee used by the *_call_implicitarg_ptr_func* tests
; in this file.
92; GCN-LABEL: {{^}}func_implicitarg_ptr:
93; GCN: s_waitcnt
94; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
95; GCN-NEXT: s_waitcnt
96; GCN-NEXT: s_setpc_b64
97define void @func_implicitarg_ptr() #0 {
98  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
99  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
100  %load = load volatile i32, i32 addrspace(4)* %cast
101  ret void
102}
103
; Non-kernel function that reads one dword through the implicit argument
; pointer. The expectations match func_implicitarg_ptr: a load from s[8:9] at
; offset 0, then a wait and the return.
; NOTE(review): despite the opencl_ prefix this uses attribute group #0, so its
; IR is identical to func_implicitarg_ptr apart from the name -- confirm
; whether #1 was intended.
104; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
105; GCN: s_waitcnt
106; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
107; GCN-NEXT: s_waitcnt
108; GCN-NEXT: s_setpc_b64
109define void @opencl_func_implicitarg_ptr() #0 {
110  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
111  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
112  %load = load volatile i32, i32 addrspace(4)* %cast
113  ret void
114}
115
; Kernel with no explicit arguments (attribute group #0) that calls
; func_implicitarg_ptr. Both runs expect s[4:5] to be copied into s[8:9]
; before the call (s_swappc_b64); s[8:9] is the pair func_implicitarg_ptr is
; expected to load from.
; Expected segment sizes match kernel_implicitarg_ptr_empty: 56 on the HSA run
; and 16 on the Mesa run.
116; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
117; HSA: enable_sgpr_kernarg_segment_ptr = 1
118; HSA: kernarg_segment_byte_size = 56
119; HSA: kernarg_segment_alignment = 4
120
121; MESA: enable_sgpr_kernarg_segment_ptr = 1
122; MESA: kernarg_segment_byte_size = 16
123; MESA: kernarg_segment_alignment = 4
124
125; GCN: s_mov_b64 s[8:9], s[4:5]
126; GCN: s_swappc_b64
127define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
128  call void @func_implicitarg_ptr()
129  ret void
130}
131
; Kernel with no explicit arguments and attribute group #3 (0 implicit argument
; bytes) that calls func_implicitarg_ptr.
; The HSA run expects the kernarg segment pointer to be disabled, a 0-byte
; segment, and s[8:9] to be set to the constant 0 before the call. The Mesa run
; expects a 16-byte segment and s[4:5] to be copied into s[8:9].
132; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty_implicit0:
133; HSA: enable_sgpr_kernarg_segment_ptr = 0
134; HSA: kernarg_segment_byte_size = 0
135; HSA: kernarg_segment_alignment = 4
136
137; MESA: enable_sgpr_kernarg_segment_ptr = 1
138; MESA: kernarg_segment_byte_size = 16
139; MESA: kernarg_segment_alignment = 4
140
141; HSA: s_mov_b64 s[8:9], 0{{$}}
142; MESA: s_mov_b64 s[8:9], s[4:5]{{$}}
143; GCN: s_swappc_b64
144define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 {
145  call void @func_implicitarg_ptr()
146  ret void
147}
148
; Kernel with no explicit arguments and attribute group #1 (48 implicit
; argument bytes) that calls func_implicitarg_ptr.
; Both runs expect s[4:5] to be copied into s[8:9]. The negative checks then
; require that neither s4 nor s5 is mentioned again before the call
; (s_swappc_b64).
; Expected segment sizes are 48 on the HSA run and 16 on the Mesa run.
; NOTE(review): the callee is func_implicitarg_ptr, not
; opencl_func_implicitarg_ptr -- confirm this is intended.
149; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
150; GCN: enable_sgpr_kernarg_segment_ptr = 1
151; HSA: kernarg_segment_byte_size = 48
152; HSA: kernarg_segment_alignment = 4
153; MESA: kernarg_segment_byte_size = 16
154; GCN: s_mov_b64 s[8:9], s[4:5]
155; GCN-NOT: s4
156; GCN-NOT: s5
157; GCN: s_swappc_b64
158define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
159  call void @func_implicitarg_ptr()
160  ret void
161}
162
; Kernel with one unnamed 112-byte explicit argument ([112 x i8]) and attribute
; group #0 that calls func_implicitarg_ptr.
; Both runs expect s[8:9] to be formed as s[4:5] + 0x70 (0x70 = 112, the size
; of the explicit argument) using an add / add-with-carry pair before the call.
; Expected segment sizes are 168 on the HSA run and 128 on the Mesa run.
; The per-target s_add_u32 directives below currently expect the same text.
163; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
164; GCN: enable_sgpr_kernarg_segment_ptr = 1
165; HSA: kernarg_segment_byte_size = 168
166; HSA: kernarg_segment_alignment = 4
167
168; MESA: kernarg_segment_byte_size = 128
169; MESA: kernarg_segment_alignment = 4
170
171; HSA: s_add_u32 s8, s4, 0x70
172; MESA: s_add_u32 s8, s4, 0x70
173
174; GCN: s_addc_u32 s9, s5, 0{{$}}
175; GCN: s_swappc_b64
176define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
177  call void @func_implicitarg_ptr()
178  ret void
179}
180
; Kernel with one unnamed 112-byte explicit argument ([112 x i8]) and attribute
; group #1 (48 implicit argument bytes) that calls func_implicitarg_ptr.
; Both runs expect s[8:9] to be formed as s[4:5] + 0x70 (0x70 = 112) before the
; call. Expected segment sizes are 160 on the HSA run (112 + 48) and 128 on the
; Mesa run.
181; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
182; GCN: enable_sgpr_kernarg_segment_ptr = 1
183; HSA: kernarg_segment_byte_size = 160
184; HSA: kernarg_segment_alignment = 4
185; MESA: kernarg_segment_byte_size = 128
186; MESA: kernarg_segment_alignment = 4
187
188; GCN: s_add_u32 s8, s4, 0x70
189; GCN: s_addc_u32 s9, s5, 0{{$}}
190; GCN: s_swappc_b64
191define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
192  call void @func_implicitarg_ptr()
193  ret void
194}
195
; Non-kernel function that calls func_implicitarg_ptr. The negative checks
; require that the output for this function never mentions s8, s9 or s[8:9].
; NOTE(review): presumably the incoming pointer is already in s[8:9] and is
; passed through to the callee untouched -- confirm against the calling
; convention.
196; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
197; GCN-NOT: s8
198; GCN-NOT: s9
199; GCN-NOT: s[8:9]
200define void @func_call_implicitarg_ptr_func() #0 {
201  call void @func_implicitarg_ptr()
202  ret void
203}
204
; Non-kernel function that calls func_implicitarg_ptr. The negative checks
; require that the output for this function never mentions s8, s9 or s[8:9].
; NOTE(review): despite the opencl_ prefix this uses attribute group #0, so its
; IR is identical to func_call_implicitarg_ptr_func apart from the name --
; confirm whether #1 was intended.
205; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
206; GCN-NOT: s8
207; GCN-NOT: s9
208; GCN-NOT: s[8:9]
209define void @opencl_func_call_implicitarg_ptr_func() #0 {
210  call void @func_implicitarg_ptr()
211  ret void
212}
213
; Non-kernel function that reads one dword through each of
; llvm.amdgcn.kernarg.segment.ptr and llvm.amdgcn.implicitarg.ptr.
; Both runs expect one load through a register pair that was set to 0 and one
; load through s[8:9] at offset 0.
; NOTE(review): the zeroed pair presumably corresponds to the kernarg segment
; pointer, which would then be null inside a non-kernel function -- confirm.
214; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
215; GCN: s_waitcnt
216; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
217; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
218; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
219; GCN: s_waitcnt lgkmcnt(0)
220define void @func_kernarg_implicitarg_ptr() #0 {
221  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
222  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
223  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
224  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Both loads are volatile so neither unused result is dropped.
225  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
226  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
227  ret void
228}
229
; Non-kernel function that reads one dword through each of
; llvm.amdgcn.kernarg.segment.ptr and llvm.amdgcn.implicitarg.ptr. The
; expectations match func_kernarg_implicitarg_ptr: one load through a register
; pair that was set to 0 and one load through s[8:9] at offset 0.
; NOTE(review): despite the opencl_ prefix this uses attribute group #0, so its
; IR is identical to func_kernarg_implicitarg_ptr apart from the name --
; confirm whether #1 was intended.
230; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
231; GCN: s_waitcnt
232; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
233; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
234; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
235; GCN: s_waitcnt lgkmcnt(0)
236define void @opencl_func_kernarg_implicitarg_ptr() #0 {
237  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
238  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
239  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
240  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Both loads are volatile so neither unused result is dropped.
241  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
242  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
243  ret void
244}
245
; Kernel with one unnamed 112-byte explicit argument ([112 x i8]) and attribute
; group #0 that calls func_kernarg_implicitarg_ptr.
; Both runs expect s[8:9] to be formed as s[4:5] + 0x70 (0x70 = 112) before the
; call (s_swappc_b64).
246; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
247; GCN: s_add_u32 s8, s4, 0x70
248; GCN: s_addc_u32 s9, s5, 0
249; GCN: s_swappc_b64
250define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
251  call void @func_kernarg_implicitarg_ptr()
252  ret void
253}
254
; Kernel with explicit arguments <16 x i32> and i32 and attribute group #1
; (48 implicit argument bytes). It reads one dword through the implicit
; argument pointer.
; Expected segment sizes are 120 on the HSA run and 84 on the Mesa run; both
; runs expect kernarg_segment_alignment = 6.
; NOTE(review): the explicit arguments occupy 68 bytes (64 + 4). 84 is 68 + 16.
; 120 is consistent with the 48 implicit bytes starting at the next 8-byte
; boundary (72 + 48) rather than at a multiple of the 64-byte vector
; alignment -- presumably what the test name refers to; confirm.
255; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
256; HSA: kernarg_segment_byte_size = 120
257; MESA: kernarg_segment_byte_size = 84
258; GCN: kernarg_segment_alignment = 6
259define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
260  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
261  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Volatile so the otherwise-unused load still shows up in the output.
262  %load = load volatile i32, i32 addrspace(4)* %cast
263  ret void
264}
265
266
267; HSA-LABEL: Kernels:
268; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty
269; HSA: CodeProps:
270; HSA: KernargSegmentSize: 56
271; HSA: KernargSegmentAlign: 8
272
273; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty_0implicit
274; HSA: KernargSegmentSize: 0
275; HSA: KernargSegmentAlign: 4
276
277; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr_empty
278; HSA: KernargSegmentSize: 48
279; HSA: KernargSegmentAlign: 8
280
281; HSA-LABEL: - Name:            kernel_implicitarg_ptr
282; HSA: KernargSegmentSize: 168
283; HSA: KernargSegmentAlign: 8
284
285; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr
286; HSA: KernargSegmentSize: 160
287; HSA: KernargSegmentAlign: 8
288
289; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty
290; HSA: KernargSegmentSize: 56
291; HSA: KernargSegmentAlign: 8
292
293; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty_implicit0
294; HSA: KernargSegmentSize: 0
295; HSA: KernargSegmentAlign: 4
296
297; HSA-LABEL:  - Name:            opencl_kernel_call_implicitarg_ptr_func_empty
298; HSA: KernargSegmentSize: 48
299; HSA: KernargSegmentAlign: 8
300
301; HSA-LABEL:  - Name:            kernel_call_implicitarg_ptr_func
302; HSA: KernargSegmentSize: 168
303; HSA: KernargSegmentAlign: 8
304
305; HSA-LABEL:  - Name:            opencl_kernel_call_implicitarg_ptr_func
306; HSA: KernargSegmentSize: 160
307; HSA: KernargSegmentAlign: 8
308
309; HSA-LABEL: - Name:            kernel_call_kernarg_implicitarg_ptr_func
310; HSA: KernargSegmentSize: 168
311; HSA: KernargSegmentAlign: 8
312
313; HSA-LABEL: - Name:            kernel_implicitarg_no_struct_align_padding
314; HSA: KernargSegmentSize: 120
315; HSA: KernargSegmentAlign: 64
316
; Intrinsics under test. Both return an i8 pointer in address space 4 and take
; no arguments.
317declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
318declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
319
; Attribute groups used above.
;   #0 - no "amdgpu-implicitarg-num-bytes" attribute.
;   #1 - "amdgpu-implicitarg-num-bytes"="48", used by the opencl_kernel_* tests
;        and kernel_implicitarg_no_struct_align_padding.
;   #2 - attributes of the intrinsic declarations.
;   #3 - "amdgpu-implicitarg-num-bytes"="0", used by the *0implicit and
;        *implicit0 tests.
320attributes #0 = { nounwind noinline }
321attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
322attributes #2 = { nounwind readnone speculatable }
323attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
324