//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about
      // it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct OutgoingValueHandler : public AMDGPUValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the
    // value ends up in a VGPR.
    // FIXME: Assert this is a shader return.
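    // Note: a VGPR value cannot be moved into a physical SGPR with a plain
    // copy, so for SGPR return locations we read the first active lane to
    // obtain a uniform value before the copy below.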
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
                        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info, ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct IncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
      : AMDGPUValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // do a 32-bit copy, and truncate to avoid the verifier complaining
      // about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public IncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct OutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                     MachineInstrBuilder MIB, CCAssignFn *AssignFn,
                     CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
                     int FPDiff = 0)
      : AMDGPUValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff),
        IsTailCall(IsTailCall) {}

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg =
          MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};
} // anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
    MachineIRBuilder &B, const ArgInfo &OrigArg,
    SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
    CallingConv::ID CallConv, bool IsOutgoing,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT =
          TLI.getTypeForExtReturn(Ctx, VT, extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g.
      // [1 x double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy,
                             OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
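// For example, a Factor of 3 turns s32 into s96 and <2 x s16> into <6 x s16>.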
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs, Register SrcReg,
                                 const CallLowering::ArgInfo &Info, LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
      B, OrigRetInfo, SplitRetInfos, DL, CC, true,
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(B, Regs, SrcReg, SplitRetInfos[VTSplitIdx], LLTy,
                             PartLLT);
      });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs) const {
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

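  // Shaders return their values to the epilog; ordinary functions return by
  // restoring the program counter from the saved return address.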
  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
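  // Each add* call below reserves the next user SGPR(s) in the fixed hardware
  // order and records the assignment in SIMachineFunctionInfo.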
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    Align ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    ArrayRef<Register> OrigArgRegs = VRegs[i];
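    // If the argument was split into several virtual registers, load the
    // whole value into a single temporary register and unpack it below.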
    Register ArgReg =
        OrigArgRegs.size() == 1
            ? OrigArgRegs[0]
            : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder
mergeVectorRegsToResultRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
                            ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs, LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register
  // types to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
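    // (Returning false reports the function as unsupported so GlobalISel can
    // fall back to the DAG path.)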
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
        B, OrigArg, SplitArgs, DL, CC, false,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          assert(DstReg == VRegs[Idx][VTSplitIdx]);
          packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs, LLTy,
                                  PartLLT);
        });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here
      // are the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the
      // frontend set up an input arg for a particular interpolation mode, but
      // nothing uses that input arg. Really we should have an earlier pass
      // that removes such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(
    MachineIRBuilder &MIRBuilder, CCState &CCInfo,
    SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
    CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo =
      &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
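  // The packed layout places the X id in bits [9:0], Y in [19:10] and Z in
  // [29:20], matching the shift amounts used below.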
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  const ArgDescriptor *IncomingArgX = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
  const ArgDescriptor *IncomingArgY = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
  const ArgDescriptor *IncomingArgZ = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));

  const LLT S32 = LLT::scalar(32);

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX);
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY);

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10))
            .getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ);

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20))
            .getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem ids are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for \p CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
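// Returns false if the callee cannot be handled here (e.g. an external symbol
// or a global with a non-zero offset), in which case the caller bails out.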
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addImm(0);
    CallInst.add(Info.Callee);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target
    // in the instruction, which is not the case. Materialize the address
    // here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();

  if (AMDGPU::isShader(F.getCallingConv())) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
        MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy,
                               PartLLT);
        });
  }

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    LLVM_DEBUG(dbgs() << "Call return values not yet handled\n");
    return false;
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
      .addImm(0)
      .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                             AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Insert copies for the SRD. In the HSA case, this should be an identity
  // copy.
  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                             MFI->getScratchRSrcReg());
  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
        MIB->getDesc(), MIB->getOperand(1), 1));
  }

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
      .addImm(0)
      .addImm(CalleePopBytes);

  return true;
}