1; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=2 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,HSA %s
2; RUN: llc -mtriple=amdgcn-amd-amdhsa --amdhsa-code-object-version=5 -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,COV5 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MESA %s
4
5; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty:
6; HSA: enable_sgpr_kernarg_segment_ptr = 1
7; HSA: kernarg_segment_byte_size = 56
8; HSA: kernarg_segment_alignment = 4
9
10; MESA: enable_sgpr_kernarg_segment_ptr = 1
11; MESA: kernarg_segment_byte_size = 16
12; MESA: kernarg_segment_alignment = 4
13
14; HSA: s_load_dword s0, s[4:5], 0x0
15
16; COV5: .amdhsa_kernarg_size 256
; Kernel with no explicit arguments. It obtains the implicit-argument pointer
; from llvm.amdgcn.implicitarg.ptr, casts it to an i32 pointer in addrspace(4),
; and issues a volatile i32 load through it. The loaded value is never used;
; the load is volatile so it is not deleted as dead code.
; Attribute group #0 carries only nounwind/noinline, so this kernel does not
; specify "amdgpu-implicitarg-num-bytes".
17define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
18  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
19  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
20  %load = load volatile i32, i32 addrspace(4)* %cast
21  ret void
22}
23
24; GCN-LABEL: {{^}}kernel_implicitarg_ptr_empty_0implicit:
25; HSA: enable_sgpr_kernarg_segment_ptr = 0
26; HSA: kernarg_segment_byte_size = 0
27; HSA: kernarg_segment_alignment = 4
28
29; MESA: enable_sgpr_kernarg_segment_ptr = 1
30; MESA: kernarg_segment_byte_size = 16
31; MESA: kernarg_segment_alignment = 4
32
33; HSA: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0{{$}}
34; HSA: s_load_dword s0, [[NULL]], 0x0
35
36; MESA: s_load_dword s0, s[4:5], 0x0
37
38; COV5: .amdhsa_kernarg_size 0
; Same body as kernel_implicitarg_ptr_empty (no explicit arguments, one
; volatile i32 load through the implicit-argument pointer), but built with
; attribute group #3, which sets "amdgpu-implicitarg-num-bytes"="0".
39define amdgpu_kernel void @kernel_implicitarg_ptr_empty_0implicit() #3 {
40  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
41  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
42  %load = load volatile i32, i32 addrspace(4)* %cast
43  ret void
44}
45
46; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr_empty:
47
48; HSA: enable_sgpr_kernarg_segment_ptr = 1
49; HSA: kernarg_segment_byte_size = 48
50; HSA: kernarg_segment_alignment = 4
51
52; MESA: enable_sgpr_kernarg_segment_ptr = 1
53; MESA: kernarg_segment_byte_size = 16
54; MESA: kernarg_segment_alignment = 4
55
56; HSA: s_load_dword s0, s[4:5], 0x0
57
58; COV5: .amdhsa_kernarg_size 48
; Kernel with no explicit arguments that performs one volatile i32 load through
; the pointer returned by llvm.amdgcn.implicitarg.ptr. It uses attribute group
; #1, which sets "amdgpu-implicitarg-num-bytes"="48".
59define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
60  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
61  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
62  %load = load volatile i32, i32 addrspace(4)* %cast
63  ret void
64}
65
66; GCN-LABEL: {{^}}kernel_implicitarg_ptr:
67
68; HSA: enable_sgpr_kernarg_segment_ptr = 1
69; HSA: kernarg_segment_byte_size = 168
70; HSA: kernarg_segment_alignment = 4
71
72; MESA: enable_sgpr_kernarg_segment_ptr = 1
73; MESA: kernarg_segment_byte_size = 128
74; MESA: kernarg_segment_alignment = 4
75
76; HSA: s_load_dword s0, s[4:5], 0x1c
77
78; COV5: .amdhsa_kernarg_size 368
; Kernel taking a single unnamed [112 x i8] explicit argument. The argument is
; never read; the body only performs a volatile i32 load through the pointer
; returned by llvm.amdgcn.implicitarg.ptr. Attribute group #0 does not set
; "amdgpu-implicitarg-num-bytes".
; NOTE(review): the s_load_dword offset 0x1c expected by the check lines above
; equals 112 / 4, which looks like a dword-scaled offset just past the explicit
; argument - confirm against the target's SMRD offset encoding.
79define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
80  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
81  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
82  %load = load volatile i32, i32 addrspace(4)* %cast
83  ret void
84}
85
86; GCN-LABEL: {{^}}opencl_kernel_implicitarg_ptr:
87
88; HSA: enable_sgpr_kernarg_segment_ptr = 1
89; HSA: kernarg_segment_byte_size = 160
90; HSA: kernarg_segment_alignment = 4
91
92; MESA: enable_sgpr_kernarg_segment_ptr = 1
93; MESA: kernarg_segment_byte_size = 128
94; MESA: kernarg_segment_alignment = 4
95
96; HSA: s_load_dword s0, s[4:5], 0x1c
97
98; COV5: .amdhsa_kernarg_size 160
; Kernel taking a single unnamed [112 x i8] explicit argument (unused) that
; performs one volatile i32 load through the implicit-argument pointer. It uses
; attribute group #1, which sets "amdgpu-implicitarg-num-bytes"="48".
99define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
100  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
101  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
102  %load = load volatile i32, i32 addrspace(4)* %cast
103  ret void
104}
105
106; GCN-LABEL: {{^}}func_implicitarg_ptr:
107; GCN: s_waitcnt
108; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
109; GCN-NEXT: s_waitcnt
110; GCN-NEXT: s_setpc_b64
; Non-kernel function (plain `define void`, no amdgpu_kernel calling
; convention) that calls llvm.amdgcn.implicitarg.ptr, casts the result to an
; i32 pointer in addrspace(4), and does a volatile i32 load through it.
; This function is the callee of the *_call_implicitarg_ptr_func* tests in
; this file.
111define void @func_implicitarg_ptr() #0 {
112  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
113  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
114  %load = load volatile i32, i32 addrspace(4)* %cast
115  ret void
116}
117
118; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
119; GCN: s_waitcnt
120; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
121; GCN-NEXT: s_waitcnt
122; GCN-NEXT: s_setpc_b64
; Non-kernel function with the same body as func_implicitarg_ptr: one volatile
; i32 load through the pointer returned by llvm.amdgcn.implicitarg.ptr.
; NOTE(review): despite the opencl_ prefix this uses attribute group #0, the
; same as func_implicitarg_ptr, whereas the opencl_ kernels in this file use
; #1 - confirm this is intentional.
123define void @opencl_func_implicitarg_ptr() #0 {
124  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
125  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
126  %load = load volatile i32, i32 addrspace(4)* %cast
127  ret void
128}
129
130; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty:
131; HSA: enable_sgpr_kernarg_segment_ptr = 1
132; HSA: kernarg_segment_byte_size = 56
133; HSA: kernarg_segment_alignment = 4
134
135; MESA: enable_sgpr_kernarg_segment_ptr = 1
136; MESA: kernarg_segment_byte_size = 16
137; MESA: kernarg_segment_alignment = 4
138
139; GCN: s_mov_b64 s[8:9], s[4:5]
140; GCN: s_swappc_b64
141
142; COV5: .amdhsa_kernarg_size 256
; Kernel with no explicit arguments whose only action is a call to
; @func_implicitarg_ptr; the kernel itself never calls
; llvm.amdgcn.implicitarg.ptr. Attribute group #0 does not set
; "amdgpu-implicitarg-num-bytes".
143define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty() #0 {
144  call void @func_implicitarg_ptr()
145  ret void
146}
147
148; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func_empty_implicit0:
149; HSA: enable_sgpr_kernarg_segment_ptr = 0
150; HSA: kernarg_segment_byte_size = 0
151; HSA: kernarg_segment_alignment = 4
152
153; MESA: enable_sgpr_kernarg_segment_ptr = 1
154; MESA: kernarg_segment_byte_size = 16
155; MESA: kernarg_segment_alignment = 4
156
157; HSA: s_mov_b64 s[8:9], 0{{$}}
158; MESA: s_mov_b64 s[8:9], s[4:5]{{$}}
159; GCN: s_swappc_b64
160
161; COV5: .amdhsa_kernarg_size 0
; Kernel with no explicit arguments that only calls @func_implicitarg_ptr.
; It uses attribute group #3, which sets "amdgpu-implicitarg-num-bytes"="0".
162define amdgpu_kernel void @kernel_call_implicitarg_ptr_func_empty_implicit0() #3 {
163  call void @func_implicitarg_ptr()
164  ret void
165}
166
167; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func_empty:
168; HSA: enable_sgpr_kernarg_segment_ptr = 1
169; HSA: kernarg_segment_byte_size = 48
170; HSA: kernarg_segment_alignment = 4
171; MESA: enable_sgpr_kernarg_segment_ptr = 1
172; MESA: kernarg_segment_byte_size = 16
173; GCN: s_mov_b64 s[8:9], s[4:5]
174; GCN-NOT: s4
175; GCN-NOT: s5
176; GCN: s_swappc_b64
177
178; COV5: .amdhsa_kernarg_size 48
; Kernel with no explicit arguments that only calls @func_implicitarg_ptr.
; It uses attribute group #1, which sets "amdgpu-implicitarg-num-bytes"="48".
179define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
180  call void @func_implicitarg_ptr()
181  ret void
182}
183
184; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
185; HSA: enable_sgpr_kernarg_segment_ptr = 1
186; HSA: kernarg_segment_byte_size = 168
187; HSA: kernarg_segment_alignment = 4
188
189; MESA: enable_sgpr_kernarg_segment_ptr = 1
190; MESA: kernarg_segment_byte_size = 128
191; MESA: kernarg_segment_alignment = 4
192
193; HSA: s_add_u32 s8, s4, 0x70
194; MESA: s_add_u32 s8, s4, 0x70
195
196; GCN: s_addc_u32 s9, s5, 0{{$}}
197; GCN: s_swappc_b64
198
199; COV5: .amdhsa_kernarg_size 368
; Kernel taking a single unnamed [112 x i8] explicit argument (unused) that
; only calls @func_implicitarg_ptr. Attribute group #0 does not set
; "amdgpu-implicitarg-num-bytes".
; The 0x70 constant in the check lines above equals 112, the byte size of the
; explicit argument.
200define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
201  call void @func_implicitarg_ptr()
202  ret void
203}
204
205; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
206; HSA: enable_sgpr_kernarg_segment_ptr = 1
207; HSA: kernarg_segment_byte_size = 160
208; HSA: kernarg_segment_alignment = 4
209; MESA: enable_sgpr_kernarg_segment_ptr = 1
210; MESA: kernarg_segment_byte_size = 128
211; MESA: kernarg_segment_alignment = 4
212
213; GCN: s_add_u32 s8, s4, 0x70
214; GCN: s_addc_u32 s9, s5, 0{{$}}
215; GCN: s_swappc_b64
216
217; COV5: .amdhsa_kernarg_size 160
; Kernel taking a single unnamed [112 x i8] explicit argument (unused) that
; only calls @func_implicitarg_ptr. It uses attribute group #1, which sets
; "amdgpu-implicitarg-num-bytes"="48".
218define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func([112 x i8]) #1 {
219  call void @func_implicitarg_ptr()
220  ret void
221}
222
223; GCN-LABEL: {{^}}func_call_implicitarg_ptr_func:
224; GCN-NOT: s8
225; GCN-NOT: s9
226; GCN-NOT: s[8:9]
227; GCN: s_swappc_b64
228; GCN: s_setpc_b64 s[30:31]
; Non-kernel function whose only action is a call to @func_implicitarg_ptr.
; It does not call llvm.amdgcn.implicitarg.ptr itself.
229define void @func_call_implicitarg_ptr_func() #0 {
230  call void @func_implicitarg_ptr()
231  ret void
232}
233
234; GCN-LABEL: {{^}}opencl_func_call_implicitarg_ptr_func:
235; GCN-NOT: s8
236; GCN-NOT: s9
237; GCN-NOT: s[8:9]
238; GCN: s_swappc_b64
239; GCN: s_setpc_b64 s[30:31]
; Non-kernel function with the same body as func_call_implicitarg_ptr_func: a
; single call to @func_implicitarg_ptr.
; NOTE(review): despite the opencl_ prefix this uses attribute group #0 rather
; than #1 - confirm this is intentional.
240define void @opencl_func_call_implicitarg_ptr_func() #0 {
241  call void @func_implicitarg_ptr()
242  ret void
243}
244
245; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
246; GCN: s_waitcnt
247; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
248; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
249; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
250; GCN: s_waitcnt lgkmcnt(0)
; Non-kernel function that reads through two intrinsic-provided pointers:
;   1. llvm.amdgcn.kernarg.segment.ptr -> cast to i32 addrspace(4)* -> %load0
;   2. llvm.amdgcn.implicitarg.ptr     -> cast to i32 addrspace(4)* -> %load1
; Both loads are volatile and their results are unused. %load0 (kernarg
; segment) is issued before %load1 (implicit arguments) in the IR.
251define void @func_kernarg_implicitarg_ptr() #0 {
252  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
253  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
254  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
255  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
256  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
257  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
258  ret void
259}
260
261; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
262; GCN: s_waitcnt
263; GCN-DAG: s_mov_b64 [[NULL:s\[[0-9]+:[0-9]+\]]], 0
264; GCN-DAG: s_load_dword s{{[0-9]+}}, [[NULL]], 0x0
265; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0
266; GCN: s_waitcnt lgkmcnt(0)
; Non-kernel function with the same body as func_kernarg_implicitarg_ptr: a
; volatile i32 load through llvm.amdgcn.kernarg.segment.ptr (%load0) followed
; by a volatile i32 load through llvm.amdgcn.implicitarg.ptr (%load1).
; NOTE(review): despite the opencl_ prefix this uses attribute group #0 rather
; than #1 - confirm this is intentional.
267define void @opencl_func_kernarg_implicitarg_ptr() #0 {
268  %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
269  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
270  %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
271  %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
272  %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
273  %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
274  ret void
275}
276
277; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
278; GCN: s_add_u32 s8, s4, 0x70
279; GCN: s_addc_u32 s9, s5, 0
280; GCN: s_swappc_b64
; Kernel taking a single unnamed [112 x i8] explicit argument (unused) that
; only calls @func_kernarg_implicitarg_ptr, the function that loads through
; both the kernarg-segment pointer and the implicit-argument pointer.
; Attribute group #0 does not set "amdgpu-implicitarg-num-bytes".
281define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
282  call void @func_kernarg_implicitarg_ptr()
283  ret void
284}
285
286; GCN-LABEL: {{^}}kernel_implicitarg_no_struct_align_padding:
287; HSA: kernarg_segment_byte_size = 120
288; HSA: kernarg_segment_alignment = 6
289; MESA: kernarg_segment_byte_size = 84
290; MESA: kernarg_segment_alignment = 6
291
292; COV5: .amdhsa_kernarg_size 120
; Kernel taking two unnamed explicit arguments, a <16 x i32> vector followed by
; an i32 (both unused), and performing one volatile i32 load through the
; implicit-argument pointer. It uses attribute group #1, which sets
; "amdgpu-implicitarg-num-bytes"="48".
; NOTE(review): the name suggests the test is about the segment size not being
; padded up to the 64-byte argument alignment (64 + 4 = 68 explicit bytes) -
; confirm against the expected sizes in the check lines above.
293define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>, i32) #1 {
294  %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
295  %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
296  %load = load volatile i32, i32 addrspace(4)* %cast
297  ret void
298}
299
300; HSA-LABEL: Kernels:
301; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty
302; HSA: CodeProps:
303; HSA: KernargSegmentSize: 56
304; HSA: KernargSegmentAlign: 8
305
306; HSA-LABEL: - Name:            kernel_implicitarg_ptr_empty_0implicit
307; HSA: KernargSegmentSize: 0
308; HSA: KernargSegmentAlign: 4
309
310; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr_empty
311; HSA: KernargSegmentSize: 48
312; HSA: KernargSegmentAlign: 8
313
314; HSA-LABEL: - Name:            kernel_implicitarg_ptr
315; HSA: KernargSegmentSize: 168
316; HSA: KernargSegmentAlign: 8
317
318; HSA-LABEL: - Name:            opencl_kernel_implicitarg_ptr
319; HSA: KernargSegmentSize: 160
320; HSA: KernargSegmentAlign: 8
321
322; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty
323; HSA: KernargSegmentSize: 56
324; HSA: KernargSegmentAlign: 8
325
326; HSA-LABEL: - Name:            kernel_call_implicitarg_ptr_func_empty_implicit0
327; HSA: KernargSegmentSize: 0
328; HSA: KernargSegmentAlign: 4
329
330; HSA-LABEL:  - Name:            opencl_kernel_call_implicitarg_ptr_func_empty
331; HSA: KernargSegmentSize: 48
332; HSA: KernargSegmentAlign: 8
333
334; HSA-LABEL:  - Name:            kernel_call_implicitarg_ptr_func
335; HSA: KernargSegmentSize: 168
336; HSA: KernargSegmentAlign: 8
337
338; HSA-LABEL:  - Name:            opencl_kernel_call_implicitarg_ptr_func
339; HSA: KernargSegmentSize: 160
340; HSA: KernargSegmentAlign: 8
341
342; HSA-LABEL: - Name:            kernel_call_kernarg_implicitarg_ptr_func
343; HSA: KernargSegmentSize: 168
344; HSA: KernargSegmentAlign: 8
345
346; HSA-LABEL: - Name:            kernel_implicitarg_no_struct_align_padding
347; HSA: KernargSegmentSize: 120
348; HSA: KernargSegmentAlign: 64
349
350; COV5-LABEL:   amdhsa.kernels:
351; COV5:         .kernarg_segment_align: 8
352; COV5-NEXT:    .kernarg_segment_size: 256
353; COV5-LABEL:   .name:           kernel_implicitarg_ptr_empty
354
355; COV5:         .kernarg_segment_align: 4
356; COV5-NEXT:    .kernarg_segment_size: 0
357; COV5-LABEL:   .name:           kernel_implicitarg_ptr_empty_0implicit
358
359; COV5:         .kernarg_segment_align: 8
360; COV5-NEXT:    .kernarg_segment_size: 48
361; COV5-LABEL:   .name:           opencl_kernel_implicitarg_ptr_empty
362
363; COV5:         .kernarg_segment_align: 8
364; COV5-NEXT:    .kernarg_segment_size: 368
365; COV5-LABEL:   .name:           kernel_implicitarg_ptr
366
367; COV5:         .kernarg_segment_align: 8
368; COV5-NEXT:    .kernarg_segment_size: 160
369; COV5-LABEL:   .name:           opencl_kernel_implicitarg_ptr
370
371; COV5:         .kernarg_segment_align: 8
372; COV5-NEXT:    .kernarg_segment_size: 256
373; COV5-LABEL:   .name:           kernel_call_implicitarg_ptr_func_empty
374
375; COV5:         .kernarg_segment_align: 4
376; COV5-NEXT:    .kernarg_segment_size: 0
377; COV5-LABEL:   .name:           kernel_call_implicitarg_ptr_func_empty_implicit0
378
379; COV5:         .kernarg_segment_align: 8
380; COV5-NEXT:    .kernarg_segment_size: 48
381; COV5-LABEL:   .name:           opencl_kernel_call_implicitarg_ptr_func_empty
382
383; COV5:         .kernarg_segment_align: 8
384; COV5-NEXT:    .kernarg_segment_size: 368
385; COV5-LABEL:   .name:           kernel_call_implicitarg_ptr_func
386
387; COV5:         .kernarg_segment_align: 8
388; COV5-NEXT:    .kernarg_segment_size: 160
389; COV5-LABEL:   .name:           opencl_kernel_call_implicitarg_ptr_func
390
391; COV5:         .kernarg_segment_align: 8
392; COV5-NEXT:    .kernarg_segment_size: 368
393; COV5-LABEL:   .name:           kernel_call_kernarg_implicitarg_ptr_func
394
395; COV5:         .kernarg_segment_align: 64
396; COV5-NEXT:    .kernarg_segment_size: 120
397; COV5-LABEL:   .name:           kernel_implicitarg_no_struct_align_padding
398
; Intrinsics used by the functions in this file. Both take no arguments and
; return an i8 pointer in addrspace(4); both are declared with attribute
; group #2.
399declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
400declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
401
; Attribute groups:
;   #0 - nounwind noinline, with no "amdgpu-implicitarg-num-bytes" value.
;   #1 - as #0, plus "amdgpu-implicitarg-num-bytes"="48" (used by the opencl_*
;        kernels and kernel_implicitarg_no_struct_align_padding).
;   #2 - nounwind readnone speculatable, applied to the intrinsic declarations.
;   #3 - as #0, plus "amdgpu-implicitarg-num-bytes"="0" (used by the
;        *_0implicit / *_implicit0 kernels).
402attributes #0 = { nounwind noinline }
403attributes #1 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="48" }
404attributes #2 = { nounwind readnone speculatable }
405attributes #3 = { nounwind noinline "amdgpu-implicitarg-num-bytes"="0" }
406