//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

/// Wrapper around extendRegister to ensure we extend to a full 32-bit
/// register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
                                    Register ValVReg, CCValAssign &VA) {
  if (VA.getLocVT().getSizeInBits() < 32) {
    // 16-bit types are reported as legal for 32-bit registers. We need to
    // extend and do a 32-bit copy to avoid the verifier complaining about it.
    return Handler.MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
  }

  return Handler.extendRegister(ValVReg, VA);
}

struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : OutgoingValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the
    // value ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           CCAssignFn *AssignFn)
      : IncomingValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);

      // If we have signext/zeroext, it applies to the whole 32-bit register
      // before truncation.
      auto Extended =
          buildExtensionHint(VA, Copy.getReg(0), LLT(VA.getLocVT()));
      MIRBuilder.buildTrunc(ValVReg, Extended);
      return;
    }

    IncomingValueHandler::assignValueToReg(ValVReg, PhysReg, VA);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
        MemSize, inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUOutgoingValueHandler(MIRBuilder, MRI, MIB, AssignFn),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
  }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg,
                            unsigned ValRegIndex, Register Addr,
                            uint64_t MemSize, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[ValRegIndex], VA)
                           : Arg.Regs[ValRegIndex];

    // If we extended the value type we might need to adjust the MMO's
    // Size. This happens if ComputeValueVTs widened a small type value to a
    // legal register type (e.g. s8->s16)
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
    assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
  }
};
} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
                                        CallingConv::ID CallConv,
                                        SmallVectorImpl<BaseArgInfo> &Outs,
                                        bool IsVarArg) const {
  // For shaders, vector types should be explicitly handled by the CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
                 MF.getFunction().getContext());

  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();
  LLVMContext &Ctx = F.getContext();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  SmallVector<EVT, 8> SplitEVTs;
  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
  assert(VRegs.size() == SplitEVTs.size() &&
         "For each split Type there should be exactly one VReg.");

  SmallVector<ArgInfo, 8> SplitRetInfos;

  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    EVT VT = SplitEVTs[i];
    Register Reg = VRegs[i];
    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);

    if (VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (RetInfo.Flags[0].isSExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (RetInfo.Flags[0].isZExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
      }
    }

    if (Reg != RetInfo.Regs[0]) {
      RetInfo.Regs[0] = Reg;
      // Reset the arg flags after modifying Reg.
      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    }

    splitToValueTypes(RetInfo, SplitRetInfos, DL, CC);
  }

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler, CC, F.isVarArg());
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs,
                                     FunctionLoweringInfo &FLI) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!FLI.CanLowerReturn)
    insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
  else if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
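// These are the preloaded scalar values (private segment buffer, dispatch
// pointer, queue pointer, kernarg segment pointer, dispatch ID, flat scratch
// init) that the runtime sets up before the kernel starts executing. Each one
// the function requires is reserved here so the calling convention does not
// reuse those registers for explicit arguments.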
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    FunctionLoweringInfo &FLI) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  // Insert the hidden sret parameter if the return value won't fit in the
  // return registers.
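  // In that case the return has been demoted to a pointer argument, and the
  // actual return values are stored through FLI.DemoteRegister when lowering
  // the return instruction.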
  if (!FLI.CanLowerReturn)
    insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented.
    if (!IsGraphics && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
  } else {
    if (!Subtarget.enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if already
  // packed.
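  // The packed layout used below places the X ID in bits [9:0], Y in bits
  // [19:10], and Z in bits [29:20] of a single 32-bit register, hence the
  // shifts by 10 and 20 when the incoming IDs arrive unpacked.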
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If the incoming IDs are not packed, we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem IDs are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target
    // in the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();
  CallingConv::ID CallConv = F.getCallingConv();

  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
      CallConv != CallingConv::AMDGPU_Gfx) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (AMDGPU::isShader(CallConv)) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  for (auto &OrigArg : Info.OrigArgs)
    splitToValueTypes(OrigArg, OutArgs, DL, Info.CallConv);

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
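  // Any register not covered by the callee-preserved mask for this calling
  // convention is treated as clobbered across the call.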
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
      Info.CallConv != CallingConv::AMDGPU_Gfx) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                                   AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (!ST.enableFlatScratch()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                               MFI->getScratchRSrcReg());
    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  auto OrigInsertPt = MIRBuilder.getInsertPt();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Insert this now to give us an anchor point for managing the insert point.
  MachineInstrBuilder CallSeqEnd =
      MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.CanLowerReturn) {
    insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
                    Info.DemoteRegister, Info.DemoteStackIndex);
  } else if (!Info.OrigRet.Ty->isVoidTy()) {
    splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
  }

  // Make sure the raw argument copies are inserted before the marshalling to
  // the original types.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler, Info.CallConv,
                           Info.IsVarArg))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  CallSeqEnd.addImm(0)
            .addImm(CalleePopBytes);

  // Restore the insert point to after the call sequence.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
  return true;
}