; Codegen test for the llvm.amdgcn.implicitarg.ptr intrinsic. The module is
; compiled twice with llc: once for an amdgcn-amd-amdhsa triple (kaveri, code
; object v2) and once for an amdgcn-mesa-mesa3d triple (tahiti). Both runs share
; the GCN check prefix; each additionally enables its own target-specific prefix.
; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s

; Kernel with no explicit arguments, using attribute group #0. It performs a
; volatile i32 load through the pointer returned by
; llvm.amdgcn.implicitarg.ptr. The expected kernarg segment size is 0 for the
; amdhsa run and 16 for the mesa3d run; the amdhsa run must load from s[4:5]
; at offset 0.
; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 0
; MESA: kernarg_segment_byte_size = 16

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Same body as the argument-less kernel case, but with attribute group #1,
; which sets "amdgpu-implicitarg-num-bytes"="48". The amdhsa run now expects a
; 48-byte kernarg segment, while the mesa3d expectation stays at 16. The amdhsa
; load is still expected from s[4:5] at offset 0.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 48
; MESA: kernarg_segment_byte_size = 16

; HSA: s_load_dword s0, s[4:5], 0x0
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel taking one unnamed [112 x i8] argument, attribute group #0. The
; volatile load goes through the implicitarg pointer. Expected kernarg segment
; size is 112 for the amdhsa run and 128 for the mesa3d run. The amdhsa load is
; expected at immediate 0x1c (28, i.e. 112 / 4 -- presumably the offset is
; encoded in dwords on this target; confirm against the ISA docs).
; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Kernel taking one unnamed [112 x i8] argument, with attribute group #1
; ("amdgpu-implicitarg-num-bytes"="48"). The amdhsa run expects a kernarg
; segment of 160 bytes (112 + 48); the mesa3d expectation is 128. The amdhsa
; load through the implicitarg pointer is expected at immediate 0x1c.
; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
; GCN: enable_sgpr_kernarg_segment_ptr = 1

; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128

; HSA: s_load_dword s0, s[4:5], 0x1c
define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function (plain `define void`) that performs a volatile i32 load
; through the implicitarg pointer. The load is expected to use s[8:9] as its
; base for the amdhsa run and s[4:5] for the mesa3d run, at offset 0, and to be
; followed directly by a wait and the return (s_setpc_b64).
; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Non-kernel function with the same body and the same expectations as a plain
; function reading the implicitarg pointer: base s[8:9] for the amdhsa run,
; s[4:5] for the mesa3d run, offset 0, then wait and return.
; NOTE(review): despite the "opencl_" name this function carries attribute
; group #0, not #1 -- confirm that is intentional.
; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Argument-less kernel (attribute group #0) that only calls
; @func_implicitarg_ptr. Expected kernarg segment size is 0 (amdhsa) / 16
; (mesa3d). Between the size line and the call (s_swappc_b64) the output must
; not mention s4, s5 or s[4:5].
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 0
; MESA: kernarg_segment_byte_size = 16
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Argument-less kernel with attribute group #1 (48 implicit argument bytes)
; that only calls @func_implicitarg_ptr. Expected kernarg segment size is 48
; (amdhsa) / 16 (mesa3d). Between the size line and the call (s_swappc_b64)
; the output must not mention s4, s5 or s[4:5].
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 48
; MESA: kernarg_segment_byte_size = 16
; GCN-NOT: s[4:5]
; GCN-NOT: s4
; GCN-NOT: s5
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with one unnamed [112 x i8] argument (attribute group #0) that only
; calls @func_implicitarg_ptr. Before the call (s_swappc_b64) a 64-bit add of
; 0x70 (112, the size of the explicit argument) is expected on s[8:9] for the
; amdhsa run and on s[4:5] for the mesa3d run, split into s_add_u32 /
; s_addc_u32. Expected kernarg segment size is 112 (amdhsa) / 128 (mesa3d).
; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 112
; MESA: kernarg_segment_byte_size = 128

; HSA: s_add_u32 s8, s8, 0x70
; MESA: s_add_u32 s4, s4, 0x70

; HSA: s_addc_u32 s9, s9, 0{{$}}
; MESA: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Kernel with one unnamed [112 x i8] argument and attribute group #1 (48
; implicit argument bytes) that only calls @func_implicitarg_ptr. Expected
; kernarg segment size is 160 (amdhsa) / 128 (mesa3d). Before the call a
; 64-bit add of 0x70 (112) is expected on s[8:9] (amdhsa) or s[4:5] (mesa3d).
; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
; GCN: enable_sgpr_kernarg_segment_ptr = 1
; HSA: kernarg_segment_byte_size = 160
; MESA: kernarg_segment_byte_size = 128

; HSA: s_add_u32 s8, s8, 0x70
; HSA: s_addc_u32 s9, s9, 0{{$}}
; MESA: s_add_u32 s4, s4, 0x70
; MESA: s_addc_u32 s5, s5, 0{{$}}
; GCN: s_swappc_b64
define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function that only calls @func_implicitarg_ptr. The output for
; this function must not mention s8, s9 or s[8:9] in the amdhsa run, nor s4,
; s5 or s[4:5] in the mesa3d run -- presumably because the incoming implicit
; argument pointer is forwarded to the callee untouched (confirm against the
; backend's calling convention).
; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
; HSA-NOT: s8
; HSA-NOT: s9
; HSA-NOT: s[8:9]
; MESA-NOT: s4
; MESA-NOT: s5
; MESA-NOT: s[4:5]
define void @func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function that only calls @func_implicitarg_ptr. The output must
; not mention s8, s9 or s[8:9] in the amdhsa run, nor s4, s5 or s[4:5] in the
; mesa3d run.
; NOTE(review): despite the "opencl_" name this function carries attribute
; group #0, not #1 -- confirm that is intentional.
; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
; HSA-NOT: s8
; HSA-NOT: s9
; HSA-NOT: s[8:9]
; MESA-NOT: s4
; MESA-NOT: s5
; MESA-NOT: s[4:5]
define void @opencl_func_call_implicitarg_ptr_func() #0 {
  call void @func_implicitarg_ptr()
  ret void
}

; Non-kernel function that does two volatile i32 loads: one through
; llvm.amdgcn.kernarg.segment.ptr and one through llvm.amdgcn.implicitarg.ptr.
; The checks expect an SGPR pair to be set to 0 with s_mov_b64 and then used
; as the base of one load (captured as NULL), plus a second load based on
; s[8:9] (amdhsa) or s[4:5] (mesa3d). Presumably the zero-based load is the
; kernarg-segment one, since the other base matches the implicitarg
; expectations of the plain function case -- confirm.
; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Non-kernel function that does two volatile i32 loads: one through
; llvm.amdgcn.kernarg.segment.ptr and one through llvm.amdgcn.implicitarg.ptr.
; The checks expect one load based on an SGPR pair that was set to 0 with
; s_mov_b64 (captured as NULL) and one load based on s[8:9] (amdhsa) or
; s[4:5] (mesa3d).
; NOTE(review): despite the "opencl_" name this function carries attribute
; group #0, not #1 -- confirm that is intentional.
; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt
; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
; MESA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; GCN: s_waitcnt lgkmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {
  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
  ret void
}

; Kernel with one unnamed [112 x i8] argument (attribute group #0) that only
; calls @func_kernarg_implicitarg_ptr. Before the call (s_swappc_b64) a 64-bit
; add of 0x70 (112, the size of the explicit argument) is expected on s[8:9]
; for the amdhsa run and on s[4:5] for the mesa3d run.
; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
; HSA: s_add_u32 s8, s8, 0x70
; HSA: s_addc_u32 s9, s9, 0
; MESA: s_add_u32 s4, s4, 0x70
; MESA: s_addc_u32 s5, s5, 0
; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
  call void @func_kernarg_implicitarg_ptr()
  ret void
}

; Kernel taking an unnamed <16 x i32> followed by an unnamed i32, with
; attribute group #1 (48 implicit argument bytes). The two explicit arguments
; occupy 64 + 4 = 68 bytes. The expected kernarg segment sizes are 120 (amdhsa)
; and 84 (mesa3d), i.e. the explicit part is not rounded up to a multiple of
; the 64-byte vector size before the implicit bytes are appended. Presumably
; 120 is 68 rounded up to 72 plus 48, and 84 is 68 plus 16 -- confirm against
; the backend. The expected kernarg_segment_alignment value is 6 (presumably
; log2 of 64).
; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
; HSA: kernarg_segment_byte_size = 120
; MESA: kernarg_segment_byte_size = 84
; GCN: kernarg_segment_alignment = 6
define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
  %load = load volatile i32, i32 addrspace(4)* %cast
  ret void
}

; Intrinsics used by the functions in this file. Both take no arguments and
; return an i8 pointer in address space 4.
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2

; Attribute groups:
;   #0 - baseline for kernels and functions (nounwind, noinline).
;   #1 - same as #0 plus "amdgpu-implicitarg-num-bytes"="48".
;   #2 - attributes of the intrinsic declarations.
attributes #0 = { nounwind noinline }
attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
attributes #2 = { nounwind readnone speculatable }
