; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; Sibling (tail) call codegen checks for fastcc functions on amdgcn. fiji and
; hawaii share one set of expectations; gfx900 has its own where the selected
; add instruction differs.

; "A5" selects address space 5 as the alloca address space, which is why every
; alloca in this file is written with addrspace(5).
target datalayout = "A5"

; Plain callee: returns %arg0 + %arg1. Several callers in this file tail call
; into it.
; FIXME: Why is this commuted only sometimes?
; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Callee that owns a stack object: volatile-stores 9 into element 5 of a
; 16 x i32 alloca, then returns %arg0 + %arg1.
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_stack_object:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
; GCN: s_waitcnt vmcnt(0)
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; Forwards %a and %b to @i32_fastcc_i32_i32 and ignores %c. Only the label is
; checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; Caller has its own stack object (volatile store of 9) before the tail call.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 68
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; Both the caller and the callee (@i32_fastcc_i32_i32_stack_object) have a
; stack object; the expected ScratchSize is twice that of the test above.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
; GCN: s_setpc_b64
; GCN: ; ScratchSize: 136
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b)
  ret i32 %ret
}

; The callee's i32 result is discarded and the caller returns void. Only the
; label is checked.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; It doesn't make sense to do a tail call from a kernel. Only the label is
; checked here; no instruction sequence is pinned.
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; Callee taking its second operand byval: loads the i32 behind %arg1 and
; returns %arg0 plus the loaded value.
; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_waitcnt vmcnt(0)

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1

; GCN-NEXT: s_setpc_b64 s[30:31]
define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32 addrspace(5)* byval(i32) align 4 %arg1) #1 {
  %arg1.load = load i32, i32 addrspace(5)* %arg1, align 4
  %add0 = add i32 %arg0, %arg1.load
  ret i32 %add0
}

; Tail call disallowed with byval in parent.
; A real call (s_swappc_b64) followed by a normal return is expected.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32 addrspace(5)* byval(i32) %b.byval, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) %b.byval)
  ret i32 %ret
}

; Tail call disallowed with byval in parent, not callee. The stack
; usage of incoming arguments must be <= the outgoing stack
; arguments.
; NOTE(review): the test this comment introduces has the byval argument on
; the callee rather than the parent, and its checks end in s_setpc_b64 with no
; s_swappc_b64 expected, so the wording above may be stale -- confirm.
; Caller receives a [32 x i32] by value in the IR sense (not byval) and passes
; a byval i32 located at the constant address 16 in addrspace(5) to the callee.
; The value is expected to be loaded from offset 16 and stored to the outgoing
; argument slot at s32 immediately before the jump.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16
; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
; GCN-NEXT: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* byval(i32) inttoptr (i32 16 to i32 addrspace(5)*))
  ret i32 %ret
}

; Callee with a large aggregate argument: returns
; %arg0 + %arg1 + %large[30] + %large[31]. The checks expect elements 30 and 31
; to be loaded from the stack (s32 offsets 4 and 8).
; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}

; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_1]]


; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9: v_add3_u32 v0, v0, v3, v2

; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
  %val_firststack = extractvalue [32 x i32] %large, 30
  %val_laststack = extractvalue [32 x i32] %large, 31
  %add0 = add i32 %arg0, %arg1
  %add1 = add i32 %add0, %val_firststack
  %add2 = add i32 %add1, %val_laststack
  ret i32 %add2
}

; FIXME: Why load and store same location for stack args?
; Forwards all three arguments, including the [32 x i32] aggregate, unchanged.
; The checks expect the stack-passed dwords at s32 offsets 0, 4 and 8 to be
; reloaded and stored back to the same slots, with no other use of s32 before
; the final jump.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:

; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_2:v[0-9]+]], off, s[0:3], s32 offset:8

; GCN-NOT: s32

; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
; GCN-DAG: buffer_store_dword [[LOAD_2]], off, s[0:3], s32 offset:8

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; Same forwarding call, but the caller also has a stack object; the volatile
; store of 9 is expected at s32 offset 32.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; If the callee requires more stack argument space than the caller,
; don't do a tail call.
; TODO: Do we really need this restriction?
; The caller takes two i32 arguments while the callee also takes a [32 x i32];
; a real call (s_swappc_b64) followed by a return is expected.
; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
; GCN: s_swappc_b64
; GCN: s_setpc_b64
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Have another non-tail call in the function: the result of the first call
; feeds the second, so the checks expect a real call (s_swappc_b64) first and
; a jump (s_setpc_b64 s[4:5]) for the second, with the spill/reload sequence
; around them.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: v_writelane_b32 [[CSRV]], s33, 2
; GCN-DAG: s_addk_i32 s32, 0x400

; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12

; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1


; GCN: s_swappc_b64

; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload

; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12

; GCN-DAG: v_readlane_b32 s30, [[CSRV]], 0
; GCN-DAG: v_readlane_b32 s31, [[CSRV]], 1

; GCN: s_addk_i32 s32, 0xfc00
; GCN-NEXT: v_readlane_b32 s33,
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
  ret i32 %ret
}

; Have stack object in caller and stack passed arguments. SP should be
; in same place at function exit.

; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
; GCN-NOT: s33
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:

; GCN-NOT: s33

; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; Caller's aggregate argument ([36 x i32]) is larger than the callee's
; ([32 x i32]); the caller's volatile store is expected at s32 offset 48 and
; s33 must not appear.
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN-NOT: s33
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48

; GCN-NOT: s33
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 5
  store volatile i32 9, i32 addrspace(5)* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; External constant in addrspace(4) holding a function pointer; loaded by the
; indirect uniform tail-call test.
@func_ptr_gv = external unnamed_addr addrspace(4) constant i32(i32, i32)*, align 4

; Do support tail calls with a uniform, but unknown, callee.
; The callee address is loaded from @func_ptr_gv; the checks expect the loaded
; SGPR pair to be the operand of the final s_setpc_b64.
; GCN-LABEL: {{^}}indirect_uniform_sibling_call_i32_fastcc_i32_i32:
; GCN: s_load_dwordx2 [[GV_ADDR:s\[[0-9]+:[0-9]+\]]]
; GCN: s_load_dwordx2 [[FUNC_PTR:s\[[0-9]+:[0-9]+\]]], [[GV_ADDR]]
; GCN: s_setpc_b64 [[FUNC_PTR]]
define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %func.ptr.load = load i32(i32, i32)*, i32(i32, i32)* addrspace(4)* @func_ptr_gv
  %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
  ret i32 %ret
}

; We can't support a tail call to a divergent target. Use a waterfall
; loop around a regular call
; GCN-LABEL: {{^}}indirect_divergent_sibling_call_i32_fastcc_i32_i32:
; GCN: v_readfirstlane_b32
; GCN: v_readfirstlane_b32
; GCN: s_and_saveexec_b64
; GCN: s_swappc_b64
; GCN: s_cbranch_execnz
; GCN: s_setpc_b64
define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(i32(i32, i32)* %func.ptr, i32 %a, i32 %b, i32 %c) #1 {
entry:
  %add = add i32 %b, %c
  %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
  ret i32 %ret
}

; Declared fastcc so the declaration agrees with the fastcc call site below; a
; calling-convention mismatch between a call and its callee is undefined
; behavior per the LLVM Language Reference.
declare hidden fastcc void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byval([3 x i32]) align 16, [2 x i64] addrspace(5)* byval([2 x i64]))

; Callee takes two byval aggregates. The caller fills a [3 x i32] with 9s and
; zeroes a [2 x i64], then tail calls; the checks expect both the local stores
; and the copies into the outgoing argument area.
; GCN-LABEL: {{^}}sibling_call_fastcc_multi_byval:
; GCN-DAG: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9

; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152

; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}

; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:160
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:164
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:168
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:172
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:16{{$}}
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:20{{$}}
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:24{{$}}
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s32 offset:28{{$}}

; GCN: s_setpc_b64 [[TARGET_ADDR]]
define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
entry:
  %alloca0 = alloca [3 x i32], align 16, addrspace(5)
  %alloca1 = alloca [2 x i64], align 8, addrspace(5)
  store [3 x i32] [i32 9, i32 9, i32 9], [3 x i32] addrspace(5)* %alloca0
  store [2 x i64] zeroinitializer, [2 x i64] addrspace(5)* %alloca1
  tail call fastcc void @void_fastcc_multi_byval(i32 %a, [3 x i32] addrspace(5)* byval([3 x i32]) %alloca0, [2 x i64] addrspace(5)* byval([2 x i64]) %alloca1)
  ret void
}

; Declared fastcc to match the fastcc call site below (see the LLVM Language
; Reference on calling-convention mismatches).
declare hidden fastcc void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)* byval([3 x i32]) align 16, [32 x i32], i32)

; Callee has a byval and non-byval stack passed argument
; GCN-LABEL: {{^}}sibling_call_byval_and_stack_passed:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9

; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:144
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:148
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:152
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:4{{$}}
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:8{{$}}
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:12{{$}}
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:16

; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v30, 0

; GCN: s_getpc_b64 [[TARGET_ADDR:s\[[0-9]+:[0-9]+\]]]
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64 [[TARGET_ADDR]]
define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 x i32]) #1 {
entry:
  %alloca = alloca [3 x i32], align 16, addrspace(5)
  store [3 x i32] [i32 9, i32 9, i32 9], [3 x i32] addrspace(5)* %alloca
  tail call fastcc void @void_fastcc_byval_and_stack_passed([3 x i32] addrspace(5)* byval([3 x i32]) %alloca, [32 x i32] zeroinitializer, i32 %stack.out.arg)
  ret void
}

; The remaining tests forward a single argument of a given type to an external
; fastcc callee of the same signature. Each expects only the address
; computation followed directly by the jump.

declare hidden fastcc i64 @i64_fastcc_i64(i64 %arg0)

; GCN-LABEL: {{^}}sibling_call_i64_fastcc_i64:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
entry:
  %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
  ret i64 %ret
}

declare hidden fastcc i8 addrspace(1)* @p1i8_fastcc_p1i8(i8 addrspace(1)* %arg0)

; GCN-LABEL: {{^}}sibling_call_p1i8_fastcc_p1i8:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc i8 addrspace(1)* @sibling_call_p1i8_fastcc_p1i8(i8 addrspace(1)* %a) #1 {
entry:
  %ret = tail call fastcc i8 addrspace(1)* @p1i8_fastcc_p1i8(i8 addrspace(1)* %a)
  ret i8 addrspace(1)* %ret
}

declare hidden fastcc i16 @i16_fastcc_i16(i16 %arg0)

; GCN-LABEL: {{^}}sibling_call_i16_fastcc_i16:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
entry:
  %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
  ret i16 %ret
}

declare hidden fastcc half @f16_fastcc_f16(half %arg0)

; GCN-LABEL: {{^}}sibling_call_f16_fastcc_f16:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
entry:
  %ret = tail call fastcc half @f16_fastcc_f16(half %a)
  ret half %ret
}

declare hidden fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %arg0)

; GCN-LABEL: {{^}}sibling_call_v3i16_fastcc_v3i16:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1 {
entry:
  %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
  ret <3 x i16> %ret
}

declare hidden fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %arg0)

; GCN-LABEL: {{^}}sibling_call_v4i16_fastcc_v4i16:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1 {
entry:
  %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
  ret <4 x i16> %ret
}

declare hidden fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %arg0)

; GCN-LABEL: {{^}}sibling_call_v2i64_fastcc_v2i64:
; GCN: s_waitcnt
; GCN-NEXT: s_getpc_b64
; GCN-NEXT: s_add_u32
; GCN-NEXT: s_addc_u32
; GCN-NEXT: s_setpc_b64
define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1 {
entry:
  %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
  ret <2 x i64> %ret
}

; NOTE(review): #0 does not appear to be referenced by any function in this
; file (every definition uses #1); kept in case that is intentional -- confirm.
attributes #0 = { nounwind }
attributes #1 = { nounwind noinline }