; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; Kernel with no explicit arguments, using attribute group #0 (which does not
; set "amdgpu-implicitarg-num-bytes"). It performs one volatile i32 load
; through the pointer returned by llvm.amdgcn.implicitarg.ptr.
;
; Expectations on the amdhsa run: the kernarg segment pointer SGPR is not
; enabled, the reported kernarg segment size is 0, and the load goes through
; a 64-bit SGPR pair that was explicitly set to 0.
; Expectations on the mesa3d run: the kernarg segment pointer stays enabled
; and a 16-byte kernarg segment is reported.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s0, [[NULL]], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same kernel body as kernel_implicitarg_ptr_empty, but using attribute group
; #3, which sets "amdgpu-implicitarg-num-bytes"="0" explicitly.
;
; The amdhsa expectations match the variant without the attribute: no kernarg
; segment pointer SGPR, zero-sized kernarg segment, load through a zeroed
; SGPR pair. On the mesa3d run the load is additionally expected to use
; s[4:5] at offset 0.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty_0implicit:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
; HSA: s_load_dword s0, [[NULL]], 0x0

; MESA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel with no explicit arguments, using attribute group #1
; ("amdgpu-implicitarg-num-bytes"="48"). It performs one volatile i32 load
; through the pointer returned by llvm.amdgcn.implicitarg.ptr.
;
; Both runs expect the kernarg segment pointer SGPR to be enabled. The amdhsa
; run expects a 48-byte kernarg segment and the load to use s[4:5] at offset
; 0; the mesa3d run expects a 16-byte kernarg segment.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel taking one unnamed [112 x i8] argument, using attribute group #0
; (no "amdgpu-implicitarg-num-bytes"). It performs one volatile i32 load
; through the pointer returned by llvm.amdgcn.implicitarg.ptr.
;
; Both runs expect the kernarg segment pointer SGPR to be enabled. The amdhsa
; run expects a 112-byte kernarg segment and the load to use s[4:5] with
; offset 0x1c; the mesa3d run expects a 128-byte kernarg segment.
; NOTE(review): 0x1c is 28, and 28 * 4 = 112, so the offset presumably is in
; dwords and points just past the explicit argument - confirm.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 112
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel taking one unnamed [112 x i8] argument, using attribute group #1
; ("amdgpu-implicitarg-num-bytes"="48"). It performs one volatile i32 load
; through the pointer returned by llvm.amdgcn.implicitarg.ptr.
;
; Both runs expect the kernarg segment pointer SGPR to be enabled. The amdhsa
; run expects a 160-byte kernarg segment (112 + 48) and the load to use
; s[4:5] with offset 0x1c; the mesa3d run expects a 128-byte kernarg segment.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4

; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function (plain "define void") that performs one volatile i32
; load through the pointer returned by llvm.amdgcn.implicitarg.ptr.
;
; Both runs expect a scalar load from s[4:5] at offset 0, immediately
; followed by an s_waitcnt and the s_setpc_b64 return.
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function with the same body and the same expected output as
; func_implicitarg_ptr: one volatile i32 load through the implicitarg
; pointer, expected to come from s[4:5] at offset 0 and to be followed
; immediately by an s_waitcnt and the s_setpc_b64 return.
; NOTE(review): despite the "opencl_" name this uses attribute group #0, not
; #1 - presumably intentional; confirm.
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel with no explicit arguments (attribute group #0) whose only action is
; to call func_implicitarg_ptr.
;
; The amdhsa run expects no kernarg segment pointer SGPR and a zero-sized
; kernarg segment; the mesa3d run expects the pointer enabled and a 16-byte
; segment. Both runs expect an s_swappc_b64 for the call.
;
; The three XGCN-NOT lines below are inactive: XGCN is not one of the check
; prefixes passed to FileCheck on either invocation at the top of the file.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; XGCN-NOT: s[4:5]
; XGCN-NOT: s4
; XGCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with no explicit arguments, using attribute group #3
; ("amdgpu-implicitarg-num-bytes"="0"), whose only action is to call
; func_implicitarg_ptr.
;
; The amdhsa run expects no kernarg segment pointer SGPR, a zero-sized
; kernarg segment, and s[4:5] to be set to 0 before the call. The mesa3d run
; expects the pointer enabled, a 16-byte segment, and no mention of s[4:5],
; s4 or s5 between the header fields and the s_swappc_b64.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty_implicit0:
; HSA: enable_sgpr_kernarg_segment_ptr = 0
; HSA: kernarg_segment_byte_size = 0
; HSA: kernarg_segment_alignment = 4

; MESA: enable_sgpr_kernarg_segment_ptr = 1
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4

; HSA: s_mov_b64 s[4:5], 0{{$}}
; MESA-NOT: s[4:5]
; MESA-NOT: s4
; MESA-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with no explicit arguments, using attribute group #1
; ("amdgpu-implicitarg-num-bytes"="48"), whose only action is to call
; func_implicitarg_ptr.
;
; Both runs expect the kernarg segment pointer SGPR enabled (48-byte segment
; on amdhsa, 16-byte on mesa3d) and no mention of s[4:5], s4 or s5 between
; the header fields and the s_swappc_b64.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 16
; MESA: kernarg_segment_alignment = 4
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel taking one unnamed [112 x i8] argument (attribute group #0) whose
; only action is to call func_implicitarg_ptr.
;
; Both runs expect the kernarg segment pointer SGPR enabled (112-byte segment
; on amdhsa, 128-byte on mesa3d) and the 64-bit value in s[4:5] to be
; advanced by 0x70 (112) with an add / add-with-carry pair before the
; s_swappc_b64.
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 112
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel taking one unnamed [112 x i8] argument, using attribute group #1
; ("amdgpu-implicitarg-num-bytes"="48"), whose only action is to call
; func_implicitarg_ptr.
;
; Both runs expect the kernarg segment pointer SGPR enabled (160-byte segment
; on amdhsa, 128-byte on mesa3d) and the 64-bit value in s[4:5] to be
; advanced by 0x70 (112) before the s_swappc_b64.
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; HSA: kernarg_segment_alignment = 4
; MESA: kernarg_segment_byte_size = 128
; MESA: kernarg_segment_alignment = 4

; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function whose only action is to call func_implicitarg_ptr.
; Both runs expect the generated code for this function not to mention s4,
; s5 or s[4:5] at all.
; NOTE(review): presumably this verifies that the incoming s[4:5] value is
; handed to the callee untouched - confirm.
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; GCN-NOT: s4
; GCN-NOT: s5
; GCN-NOT: s[4:5]
define void @func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function with the same body and expectations as
; func_call_implicitarg_ptr_func: it only calls func_implicitarg_ptr, and the
; generated code must not mention s4, s5 or s[4:5].
; NOTE(review): despite the "opencl_" name this uses attribute group #0, not
; #1 - presumably intentional; confirm.
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; GCN-NOT: s4
; GCN-NOT: s5
; GCN-NOT: s[4:5]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function that performs two volatile i32 loads: one through the
; pointer returned by llvm.amdgcn.kernarg.segment.ptr and one through the
; pointer returned by llvm.amdgcn.implicitarg.ptr.
;
; Both runs expect one scalar load through a 64-bit SGPR pair that was set to
; 0, then a scalar load from s[4:5] at offset 0, then "s_waitcnt lgkmcnt(0)".
; NOTE(review): the zeroed pair presumably corresponds to the kernarg segment
; pointer (func_implicitarg_ptr shows the implicitarg load using s[4:5]) -
; confirm.
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Both results are unused; the loads are volatile so they still have to be
  ; emitted, in this order.
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Non-kernel function with the same body and expectations as
; func_kernarg_implicitarg_ptr: one volatile i32 load through the kernarg
; segment pointer and one through the implicitarg pointer. Expected output is
; a scalar load through a zeroed 64-bit SGPR pair, a scalar load from s[4:5]
; at offset 0, and then "s_waitcnt lgkmcnt(0)".
; NOTE(review): despite the "opencl_" name this uses attribute group #0, not
; #1 - presumably intentional; confirm.
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; Both results are unused; the loads are volatile so they still have to be
  ; emitted, in this order.
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Kernel taking one unnamed [112 x i8] argument (attribute group #0) whose
; only action is to call func_kernarg_implicitarg_ptr.
;
; Both runs expect the 64-bit value in s[4:5] to be advanced by 0x70 (112)
; with an add / add-with-carry pair before the s_swappc_b64.
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; GCN: s_add_u32 s4, s4, 0x70
; GCN: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_kernarg_implicitarg_ptr()
  ret void
}

; Kernel taking an unnamed <16 x i32> followed by an unnamed i32, using
; attribute group #1 ("amdgpu-implicitarg-num-bytes"="48"). It performs one
; volatile i32 load through the pointer returned by
; llvm.amdgcn.implicitarg.ptr.
;
; The amdhsa run expects a 120-byte kernarg segment and the mesa3d run an
; 84-byte one; both expect kernarg_segment_alignment = 6.
; NOTE(review): the explicit arguments occupy 64 + 4 = 68 bytes, so the
; expected sizes are presumably 72 + 48 and 68 + 16, i.e. the explicit part
; is not padded out to the 64-byte vector alignment - confirm.
; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
; HSA: kernarg_segment_byte_size = 120
; MESA: kernarg_segment_byte_size = 84
; GCN: kernarg_segment_alignment = 6
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  ; The result is unused; the load is volatile so it still has to be emitted.
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}


; HSA-LABEL: Kernels:
; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty
; HSA: CodeProps:
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty_0implicit
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 112
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty_implicit0
; HSA: KernargSegmentSize: 0
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            opencl_kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 48
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 112
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            opencl_kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 160
; HSA: KernargSegmentAlign: 8

; HSA-LABEL: - Name:            kernel_call_kernarg_implicitarg_ptr_func
; HSA: KernargSegmentSize: 112
; HSA: KernargSegmentAlign: 4

; HSA-LABEL: - Name:            kernel_implicitarg_no_struct_align_padding
; HSA: KernargSegmentSize: 120
; HSA: KernargSegmentAlign: 64

; Intrinsics used by the functions in this file. Both return an
; i8 addrspace(4)* and take no arguments.
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

; Attribute groups:
;   #0 - functions/kernels without "amdgpu-implicitarg-num-bytes".
;   #1 - kernels with "amdgpu-implicitarg-num-bytes"="48".
;   #2 - the intrinsic declarations above.
;   #3 - kernels with "amdgpu-implicitarg-num-bytes"="0".
attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #2 = { nounwind readnone speculatable }
attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
