1; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; "A5" makes address space 5 the alloca address space, matching the
; addrspace(5) allocas used by the functions in this file.
4target datalayout = "A5"
5
6; FIXME: Why is this commuted only sometimes?
7; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
8; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
9; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
10; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
11; GCN-NEXT: s_setpc_b64
; Leaf fastcc function returning %arg0 + %arg1. It has no stack objects and
; serves as the callee for several of the sibling_call_* tests in this file.
12define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
13  %add0 = add i32 %arg0, %arg1
14  ret i32 %add0
15}
16
17; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
18; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
20; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
21; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
22; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
23; GCN: s_waitcnt vmcnt(0)
24; GCN: s_setpc_b64
25; GCN: ; ScratchSize: 68
; Same addition as @i32_fastcc_i32_i32, but the function also owns a
; 16 x i32 alloca so that it has a stack object of its own.
26define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
27  %alloca = alloca [16 x i32], align 4, addrspace(5)
  ; Element 5 of the array, i.e. byte offset 20 into the alloca.
28  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  ; Volatile, so the store (and with it the alloca) cannot be optimized away.
29  store volatile i32 9, i32 addrspace(5)* %gep
30  %add0 = add i32 %arg0, %arg1
31  ret i32 %add0
32}
33
34; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
; Simplest sibling-call case: forwards %a and %b to @i32_fastcc_i32_i32 in
; tail position and returns its result. %c is unused.
35define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
36entry:
37  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
38  ret i32 %ret
39}
40
41; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
42; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
43; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
44; GCN: s_setpc_b64
45; GCN: ; ScratchSize: 68
; Sibling call where the caller owns a stack object: writes 9 into element 5
; of a local 16 x i32 array, then tail-calls @i32_fastcc_i32_i32 (which has no
; stack object of its own). %c is unused.
46define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
47entry:
48  %alloca = alloca [16 x i32], align 4, addrspace(5)
49  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  ; Volatile, so the store into the caller's stack object must be emitted.
50  store volatile i32 9, i32 addrspace(5)* %gep
51  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
52  ret i32 %ret
53}
54
55; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
56; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
57; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
58; GCN: s_setpc_b64
59; GCN: ; ScratchSize: 136
; Sibling call where both sides own a stack object: the caller stores into its
; own 16 x i32 array and then tail-calls @i32_fastcc_i32_i32_stack_object,
; which allocates an identical array. %c is unused.
60define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
61entry:
62  %alloca = alloca [16 x i32], align 4, addrspace(5)
63  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  ; Volatile, so the store into the caller's stack object must be emitted.
64  store volatile i32 9, i32 addrspace(5)* %gep
65  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
66  ret i32 %ret
67}
68
69; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
; The caller returns void while the callee returns i32: the call is marked
; `tail` but its result %ret is discarded. %c is unused.
70define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
71entry:
72  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
73  ret void
74}
75
76; It doesn't make sense to do a tail call from a kernel
77; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
78;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
; Same body as @sibling_call_i32_fastcc_i32_i32_unused_result, but the caller
; is an amdgpu_kernel entry point instead of a fastcc function. Only the label
; is matched for this function; no instruction patterns follow it.
79define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
80entry:
81  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
82  ret void
83}
84
85; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
86; GCN: s_waitcnt
87; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
88; GCN-NEXT: s_waitcnt vmcnt(0)
89
90; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
91; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
92
93; GCN-NEXT: s_setpc_b64 s[30:31]
; Callee taking one i32 by value in a register-style argument and a second
; i32 through a byval pointer in address space 5; returns their sum.
94define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval(i32) align 4 %arg1) #1 {
  ; Read the byval copy of the second operand.
95  %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
96  %add0 = add i32 %arg0, %arg1.load
97  ret i32 %add0
98}
99
100; Tail call disallowed with byval in parent.
101; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
102; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
103; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
104; GCN: s_swappc_b64
105; GCN-NOT: v_readlane_b32 s32
106; GCN: s_setpc_b64
; The caller itself receives a byval argument (%b.byval) and passes that same
; pointer on as the callee's byval argument. %c is unused.
107define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval(i32) %b.byval, i32 %c) #1 {
108entry:
109  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
110  ret i32 %ret
111}
112
113; Tail call disallowed with byval in parent, not callee. The stack
114; usage of the outgoing arguments must be <= that of the incoming
115; stack arguments.
116
117; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
118; GCN-NOT: v0
119; GCN-NOT: s32
120; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
121; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
122; GCN-NEXT: s_setpc_b64
; The caller has no byval parameter of its own; it takes a 32-element array
; (%large, otherwise unused) and passes the constant address 16 in address
; space 5 as the callee's byval argument.
123define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
124entry:
125  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) inttoptr (i32 16 to i32 addrspace(5)*))
126  ret i32 %ret
127}
128
129; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
130; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
131; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
132; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
133
134; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
135; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
136; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]
137
138
139; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
140; GFX9: v_add3_u32 v0, v0, v3, v2
141
142; GCN-NEXT: s_setpc_b64
; Callee with two scalar arguments plus a 32-element i32 array. Returns
; %arg0 + %arg1 + %large[30] + %large[31].
; NOTE(review): the value names suggest elements 30 and 31 are the ones that
; end up being passed on the stack -- confirm against the calling convention.
143define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
144  %val_firststack = extractvalue [32 x i32] %large, 30
145  %val_laststack = extractvalue [32 x i32] %large, 31
146  %add0 = add i32 %arg0, %arg1
147  %add1 = add i32 %add0, %val_firststack
148  %add2 = add i32 %add1, %val_laststack
149  ret i32 %add2
150}
151
152; FIXME: Why load and store same location for stack args?
153; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
154
155; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
156; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
157; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8
158
159; GCN-NOT: s32
160
161; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
162; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
163; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8
164
165; GCN-NOT: s32
166; GCN: s_setpc_b64
; Caller and callee have identical signatures; all three arguments, including
; the 32-element array, are forwarded unchanged in a tail call.
167define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
168entry:
169  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
170  ret i32 %ret
171}
172
173; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
174; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
175; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
176; GCN: s_setpc_b64
; Like @sibling_call_i32_fastcc_i32_i32_a32i32, but the caller additionally
; owns a 16 x i32 stack object and stores 9 into its element 5 before
; forwarding all arguments in a tail call.
177define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
178entry:
179  %alloca = alloca [16 x i32], align 4, addrspace(5)
180  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  ; Volatile, so the store into the caller's stack object must be emitted.
181  store volatile i32 9, i32 addrspace(5)* %gep
182  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
183  ret i32 %ret
184}
185
186; If the callee requires more stack argument space than the caller,
187; don't do a tail call.
188; TODO: Do we really need this restriction?
189
190; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
191; GCN: s_swappc_b64
192; GCN: s_setpc_b64
; The caller takes only two i32 arguments, while the callee additionally
; receives a 32-element array (passed here as zeroinitializer), so the callee's
; argument list is larger than the caller's.
193define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
194entry:
195  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
196  ret i32 %ret
197}
198
199; Have another non-tail call in the function
200; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
201; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
202; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
203; GCN-NEXT: s_mov_b64 exec
204; GCN: v_writelane_b32 [[CSRV]], s33, 2
205; GCN-DAG: s_addk_i32 s32, 0x400
206
207; GCN-DAG: s_getpc_b64 s[4:5]
208; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
209; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12
210
211; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
212; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
213; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
214; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1
215
216
217; GCN: s_swappc_b64
218
219; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
220; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
221
222; GCN: s_getpc_b64 s[4:5]
223; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
224; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
225
226; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0
227; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1
228
229; GCN: s_addk_i32 s32, 0xfc00
230; GCN-NEXT: v_readlane_b32 s33,
231; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
232; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
233; GCN-NEXT: s_mov_b64 exec, s[6:7]
234; GCN-NEXT: s_setpc_b64 s[4:5]
; Two calls, both marked `tail`. The result of the first is an argument of the
; second, so only the second call is immediately followed by the return of its
; own result. %c is unused.
235define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
236entry:
  ; Result is consumed below, so %a and %b must stay live across this call.
237  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
238  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
239  ret i32 %ret
240}
241
242; Have stack object in caller and stack passed arguments. SP should be
243; in same place at function exit.
244
245; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
246; GCN-NOT: s33
247; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
248
249; GCN-NOT: s33
250
251; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
252; GCN: s_setpc_b64 s[4:5]
; Caller with both a local stack object (16 x i32 alloca) and an incoming
; 32-element array argument; stores 9 into the alloca and then forwards all of
; its arguments unchanged in a tail call.
253define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
254entry:
255  %alloca = alloca [16 x i32], align 4, addrspace(5)
256  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  ; Volatile, so the store into the caller's stack object must be emitted.
257  store volatile i32 9, i32 addrspace(5)* %gep
258  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
259  ret i32 %ret
260}
261
262; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
263; GCN-NOT: s33
264; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
265
266; GCN-NOT: s33
267; GCN: s_setpc_b64 s[4:5]
; Variant in which the caller's array parameter (36 x i32, unused) is larger
; than the callee's (32 x i32, passed as zeroinitializer), so the caller's
; incoming argument list is bigger than the outgoing one. The caller also owns
; a 16 x i32 stack object.
268define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
269entry:
270  %alloca = alloca [16 x i32], align 4, addrspace(5)
271  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  ; Volatile, so the store into the caller's stack object must be emitted.
272  store volatile i32 9, i32 addrspace(5)* %gep
273  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
274  ret i32 %ret
275}
276
; Externally defined constant in address space 4 holding a pointer to an
; i32(i32, i32) function; loaded by the indirect uniform sibling-call test.
277@func_ptr_gv = external unnamed_addr addrspace(4) constant i32(i32, i32)*, align 4
278
279; Do support tail calls with a uniform, but unknown, callee.
280; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
281; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
282; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
283; GCN: s_setpc_b64 [[FUNC_PTR]]
; Indirect sibling call: the callee address is loaded from the constant global
; @func_ptr_gv rather than derived from a function argument, then called in
; tail position with %a and %b. %c is unused.
284define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
285entry:
286  %func.ptr.load = load i32(i32, i32)*, i32(i32, i32)* addrspace(4)* @func_ptr_gv
287  %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
288  ret i32 %ret
289}
290
291; We can't support a tail call to a divergent target. Use a waterfall
292; loop around a regular call
293; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
294; GCN: v_readfirstlane_b32
295; GCN: v_readfirstlane_b32
296; GCN: s_and_saveexec_b64
297; GCN: s_swappc_b64
298; GCN: s_cbranch_execnz
299; GCN: s_setpc_b64
; Indirect call whose target comes in as an ordinary function argument
; (%func.ptr). Calls it in tail position with %a and %b + %c.
300define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(i32(i32, i32)* %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
301entry:
302  %add = add i32 %b, %c
303  %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
304  ret i32 %ret
305}
306
; External callee with one i32 and two byval aggregates: a [3 x i32] aligned
; to 16 and a [2 x i64].
; NOTE(review): despite its name this declaration carries no `fastcc`, while
; the call in @sibling_call_fastcc_multi_byval is `tail call fastcc` --
; confirm the calling-convention mismatch is intentional.
307declare hidden void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byval([3 x i32]) align 16, [2 x i64] addrspace(5)* byval([2 x i64]))
308
309; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval:
310; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
311; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
312; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
313
314; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
315; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
316; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
317
318; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
319; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
320; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
321
322; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160
323; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164
324; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168
325; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172
326; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}}
327; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}}
328; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}}
329; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}}
330
331; GCN: s_setpc_b64 [[TARGET_ADDR]]
; Builds two local aggregates and passes both as byval arguments of a single
; tail call. The unnamed [64 x i32] parameter is never read.
332define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
333entry:
334  %alloca0 = alloca [3 x i32], align 16, addrspace(5)
335  %alloca1 = alloca [2 x i64], align 8, addrspace(5)
  ; Fill the first aggregate with 9s and the second with zeros so the two
  ; byval copies carry distinguishable values.
336  store [3 x i32] [i32 9, i32 9, i32 9], [3 x i32] addrspace(5)* %alloca0
337  store [2 x i64] zeroinitializer, [2 x i64] addrspace(5)* %alloca1
338  tail call fastcc void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byval([3 x i32]) %alloca0, [2 x i64] addrspace(5)* byval([2 x i64]) %alloca1)
339  ret void
340}
341
; External callee taking a byval [3 x i32] aligned to 16, a 32-element i32
; array, and a trailing i32.
; NOTE(review): despite its name this declaration carries no `fastcc`, while
; the call in @sibling_call_byval_and_stack_passed is `tail call fastcc` --
; confirm the calling-convention mismatch is intentional.
342declare hidden void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)* byval([3 x i32]) align 16, [32 x i32], i32)
343
344; Callee has a byval and non-byval stack passed argument
345; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed:
346; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
347
348; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
349; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
350; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
351; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
352; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
353; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
354; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}}
355; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16
356
357; GCN: v_mov_b32_e32 v0, 0
358; GCN: v_mov_b32_e32 v30, 0
359
360; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
361; GCN-NEXT: s_add_u32
362; GCN-NEXT: s_addc_u32
363; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]]
; Tail call mixing a byval aggregate with ordinary arguments: a local
; [3 x i32] filled with 9s goes by byval, followed by an all-zero 32-element
; array and the caller's own first argument %stack.out.arg as the last
; operand. The unnamed [64 x i32] parameter is never read.
364define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
365entry:
366  %alloca = alloca [3 x i32], align 16, addrspace(5)
367  store [3 x i32] [i32 9, i32 9, i32 9], [3 x i32] addrspace(5)* %alloca
368  tail call fastcc void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)* byval([3 x i32]) %alloca, [32 x i32] zeroinitializer, i32 %stack.out.arg)
369  ret void
370}
371
; External fastcc callee taking and returning i64; only its signature is
; needed by the test that follows.
372declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)
373
374; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64:
375; GCN: s_waitcnt
376; GCN-NEXT: s_getpc_b64
377; GCN-NEXT: s_add_u32
378; GCN-NEXT: s_addc_u32
379; GCN-NEXT: s_setpc_b64
; Forwards its single i64 argument unchanged to @i64_fastcc_i64 in tail
; position and returns the result.
380define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
381entry:
382  %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
383  ret i64 %ret
384}
385
; External fastcc callee taking and returning an i8 pointer in address
; space 1; only its signature is needed by the test that follows.
386declare hidden fastcc i8 addrspace(1)* @p1i8_fastcc_p1i8(i8 addrspace(1)* %arg0)
387
388; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8:
389; GCN: s_waitcnt
390; GCN-NEXT: s_getpc_b64
391; GCN-NEXT: s_add_u32
392; GCN-NEXT: s_addc_u32
393; GCN-NEXT: s_setpc_b64
; Forwards its single pointer argument unchanged to @p1i8_fastcc_p1i8 in tail
; position and returns the result.
394define hidden fastcc i8 addrspace(1)* @sibling_call_p1i8_fastcc_p1i8(i8 addrspace(1)* %a) #1 {
395entry:
396  %ret = tail call fastcc i8 addrspace(1)* @p1i8_fastcc_p1i8(i8 addrspace(1)* %a)
397  ret i8 addrspace(1)* %ret
398}
399
; External fastcc callee taking and returning i16; only its signature is
; needed by the test that follows.
400declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)
401
402; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16:
403; GCN: s_waitcnt
404; GCN-NEXT: s_getpc_b64
405; GCN-NEXT: s_add_u32
406; GCN-NEXT: s_addc_u32
407; GCN-NEXT: s_setpc_b64
; Forwards its single i16 argument unchanged to @i16_fastcc_i16 in tail
; position and returns the result.
408define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
409entry:
410  %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
411  ret i16 %ret
412}
413
; External fastcc callee taking and returning half; only its signature is
; needed by the test that follows.
414declare hidden fastcc half @f16_fastcc_f16(half %arg0)
415
416; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16:
417; GCN: s_waitcnt
418; GCN-NEXT: s_getpc_b64
419; GCN-NEXT: s_add_u32
420; GCN-NEXT: s_addc_u32
421; GCN-NEXT: s_setpc_b64
; Forwards its single half argument unchanged to @f16_fastcc_f16 in tail
; position and returns the result.
422define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
423entry:
424  %ret = tail call fastcc half @f16_fastcc_f16(half %a)
425  ret half %ret
426}
427
; External fastcc callee taking and returning <3 x i16>; only its signature is
; needed by the test that follows.
428declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)
429
430; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16:
431; GCN: s_waitcnt
432; GCN-NEXT: s_getpc_b64
433; GCN-NEXT: s_add_u32
434; GCN-NEXT: s_addc_u32
435; GCN-NEXT: s_setpc_b64
; Forwards its single <3 x i16> argument unchanged to @v3i16_fastcc_v3i16 in
; tail position and returns the result.
436define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
437entry:
438  %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
439  ret <3 x i16> %ret
440}
441
; External fastcc callee taking and returning <4 x i16>; only its signature is
; needed by the test that follows.
442declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)
443
444; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16:
445; GCN: s_waitcnt
446; GCN-NEXT: s_getpc_b64
447; GCN-NEXT: s_add_u32
448; GCN-NEXT: s_addc_u32
449; GCN-NEXT: s_setpc_b64
; Forwards its single <4 x i16> argument unchanged to @v4i16_fastcc_v4i16 in
; tail position and returns the result.
450define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
451entry:
452  %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
453  ret <4 x i16> %ret
454}
455
; External fastcc callee taking and returning <2 x i64>; only its signature is
; needed by the test that follows.
456declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)
457
458; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64:
459; GCN: s_waitcnt
460; GCN-NEXT: s_getpc_b64
461; GCN-NEXT: s_add_u32
462; GCN-NEXT: s_addc_u32
463; GCN-NEXT: s_setpc_b64
; Forwards its single <2 x i64> argument unchanged to @v2i64_fastcc_v2i64 in
; tail position and returns the result.
464define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
465entry:
466  %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
467  ret <2 x i64> %ret
468}
469
; Attribute groups. Every function defined in the visible part of this file
; uses #1 (nounwind + noinline); #0 has no visible user.
470attributes #0 = { nounwind }
471attributes #1 = { nounwind noinline }
472