//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
                                    Register ValVReg, CCValAssign &VA) {
  if (VA.getLocVT().getSizeInBits() < 32) {
    // 16-bit types are reported as legal for 32-bit registers. We need to
    // extend and do a 32-bit copy to avoid the verifier complaining about it.
    return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
  }

  return Handler.extendRegister(ValVReg, VA);
}

struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB)
      : OutgoingValueHandler(B, MRI), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the
    // value ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }
};

struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : IncomingValueHandler(B, MRI) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();

    // Byval is assumed to be writable memory, but other stack passed arguments
    // are not.
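    // An immutable fixed object models an incoming argument slot that this
    // function never writes.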
    const bool IsImmutable = !Flags.isByVal();
    int FI = MFI.CreateFixedObject(Size, Offset, IsImmutable);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about
      // it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);

      // If we have signext/zeroext, it applies to the whole 32-bit register
      // before truncation.
      auto Extended =
          buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
      MIRBuilder.buildTrunc(ValVReg, Extended);
      return;
    }

    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
        MemSize, inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI)
      : AMDGPUIncomingArgHandler(B, MRI) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
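  // Created lazily by getStackAddress on the first stack-passed argument.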
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB), FPDiff(FPDiff),
        IsTailCall(IsTailCall) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO,
                           ISD::ArgFlagsTy Flags) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg =
          MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg,
                            unsigned ValRegIndex, Register Addr,
                            uint64_t MemSize, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[ValRegIndex], VA)
                           : Arg.Regs[ValRegIndex];

    // If we extended the value type we might need to adjust the MMO's
    // Size. This happens if ComputeValueVTs widened a small type value to a
    // legal register type (e.g. s8->s16).
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
    assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
  }
};
} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
                                        CallingConv::ID CallConv,
                                        SmallVectorImpl<BaseArgInfo> &Outs,
                                        bool IsVarArg) const {
  // For shaders and kernels, vector types should be explicitly handled by the
  // calling convention.
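  // Entry functions never demote their return values to memory, so there is
  // nothing further to check here.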
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
                 MF.getFunction().getContext());

  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();
  LLVMContext &Ctx = F.getContext();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  SmallVector<EVT, 8> SplitEVTs;
  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
  assert(VRegs.size() == SplitEVTs.size() &&
         "For each split Type there should be exactly one VReg.");

  SmallVector<ArgInfo, 8> SplitRetInfos;

  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    EVT VT = SplitEVTs[i];
    Register Reg = VRegs[i];
    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);

    if (VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (RetInfo.Flags[0].isSExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (RetInfo.Flags[0].isZExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT =
          TLI.getTypeForExtReturn(Ctx, VT, extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
      }
    }

    if (Reg != RetInfo.Regs[0]) {
      RetInfo.Regs[0] = Reg;
      // Reset the arg flags after modifying Reg.
      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    }

    splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
  }

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueAssigner Assigner(AssignFn);
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret);
  return determineAndHandleAssignments(RetHandler, Assigner, SplitRetInfos, B,
                                       CC, F.isVarArg());
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs,
                                     FunctionLoweringInfo &FLI) const {
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

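  // Build the return instruction off to the side so lowerReturnVal can attach
  // the return value registers to it as implicit uses before it is inserted
  // at the end.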
  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!FLI.CanLowerReturn)
    insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
  else if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn =
        MF.addLiveIn(TRI->getReturnAddressReg(MF), &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
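  // User SGPRs are assigned in the fixed order below; each add* call on
  // SIMachineFunctionInfo reserves the next SGPR(s) in the kernel's user SGPR
  // block.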
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();

  Info->allocateModuleLDSGlobal(F.getParent());

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

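    // Kernel arguments are packed consecutively in the kernarg segment; each
    // is placed at its ABI type alignment (or the explicit byref alignment if
    // one was given).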
    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    FunctionLoweringInfo &FLI) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  Info->allocateModuleLDSGlobal(F.getParent());

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn =
        MF.addLiveIn(ReturnAddrReg, &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  // Insert the hidden sret parameter if the return value won't fit in the
  // return registers.
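  // The demoted return value is later stored back through this pointer by
  // insertSRetStores in lowerReturn.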
  if (!FLI.CanLowerReturn)
    insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to non-graphics functions are not implemented.
    if (!IsGraphics && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg);
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
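    // Reserving it before user arguments keeps the calling convention
    // assigner from handing that VGPR to a normal argument.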
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  IncomingValueAssigner Assigner(AssignFn);
  if (!determineAssignments(Assigner, SplitArgs, CCInfo))
    return false;

  FormalArgHandler Handler(B, MRI);
  if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
  } else {
    if (!Subtarget.enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(
    MachineIRBuilder &MIRBuilder, CCState &CCInfo,
    SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
    CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo =
      &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
      AMDGPUFunctionArgInfo::DISPATCH_PTR,
      AMDGPUFunctionArgInfo::QUEUE_PTR,
      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
      AMDGPUFunctionArgInfo::DISPATCH_ID,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
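  // The packed layout places the X id in bits [9:0], Y in bits [19:10] and Z
  // in bits [29:20] of a single VGPR.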
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10))
            .getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20))
            .getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem ids are already packed, so any of the present incoming
    // arguments will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for \p CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target
    // in the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();
  CallingConv::ID CallConv = F.getCallingConv();

  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
      CallConv != CallingConv::AMDGPU_Gfx) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (AMDGPU::isShader(CallConv)) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  for (auto &OrigArg : Info.OrigArgs)
    splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);

  SmallVector<ArgInfo, 8> InArgs;
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy())
    splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
      .addImm(0)
      .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
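  // The regmask operand marks every register not preserved by the callee's
  // calling convention as clobbered across the call.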
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
      Info.CallConv != CallingConv::AMDGPU_Gfx) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;

  OutgoingValueAssigner Assigner(AssignFnFixed, AssignFnVarArg);
  if (!determineAssignments(Assigner, OutArgs, CCInfo))
    return false;

  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, false);
  if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (!ST.enableFlatScratch()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg =
        MIRBuilder.buildCopy(LLT::vector(4, 32), MFI->getScratchRSrcReg());
    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
        MIB->getDesc(), MIB->getOperand(1), 1));
  }

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
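  // CallReturnHandler adds those implicit defs as it assigns each returned
  // register.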
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn =
        TLI.CCAssignFnForReturn(Info.CallConv, Info.IsVarArg);
    OutgoingValueAssigner Assigner(RetAssignFn);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB);
    if (!determineAndHandleAssignments(Handler, Assigner, InArgs, MIRBuilder,
                                       Info.CallConv, Info.IsVarArg))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
      .addImm(0)
      .addImm(CalleePopBytes);

  if (!Info.CanLowerReturn) {
    insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
                    Info.DemoteRegister, Info.DemoteStackIndex);
  }

  return true;
}