//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
                     MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
      : ValueHandler(IsIncoming, B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           CCAssignFn *AssignFn)
      : AMDGPUValueHandler(true, B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
  }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg,
                            unsigned ValRegIndex, Register Addr,
                            uint64_t MemSize, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[ValRegIndex], VA)
                           : Arg.Regs[ValRegIndex];

    // If we extended the value type we might need to adjust the MMO's Size.
    // This happens if ComputeValueVTs widened a small type value to a legal
    // register type (e.g. s8->s16).
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(MemSize, (uint64_t)RegTy.getSizeInBytes());
    assignValueToAddress(ValVReg, Addr, MemSize, MPO, VA);
  }
};
} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::processSplitArgs(
    MachineIRBuilder &B, const ArgInfo &OrigArg,
    const SmallVectorImpl<ArgInfo> &SplitArg,
    SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
    CallingConv::ID CallConv, bool IsOutgoing,
    SplitArgTy PerformArgSplit) const {
  LLVMContext &Ctx = OrigArg.Ty->getContext();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  // FIXME: This is mostly nasty pre-processing before handleAssignments. Most
  // of this should be performed by handleAssignments.

  for (int SplitIdx = 0, e = SplitArg.size(); SplitIdx != e; ++SplitIdx) {
    const ArgInfo &CurSplitArg = SplitArg[SplitIdx];
    Register Reg = OrigArg.Regs[SplitIdx];
    EVT VT = EVT::getEVT(CurSplitArg.Ty);
    LLT LLTy = getLLTForType(*CurSplitArg.Ty, DL);

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, CurSplitArg.Ty, OrigArg.Flags,
                             OrigArg.IsFixed);
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy,
                             OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
  }
}

// TODO: Move to generic code
//
// Unpack SrcReg, which holds a value of the original type SrcTy, into the
// PartTy-typed DstRegs expected by the calling convention, padding with dead
// defs when the types do not evenly divide.
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  LLT GCDTy = getGCDType(SrcTy, PartTy);
  if (GCDTy == PartTy) {
    // If this is already evenly divisible, we can create a simple unmerge.
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  MachineRegisterInfo &MRI = *B.getMRI();
  LLT DstTy = MRI.getType(DstRegs[0]);
  LLT LCMTy = getLCMType(SrcTy, PartTy);

  const unsigned LCMSize = LCMTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  Register UnmergeSrc = SrcReg;
  if (LCMSize != SrcSize) {
    // Widen to the common type.
    Register Undef = B.buildUndef(SrcTy).getReg(0);
    SmallVector<Register, 8> MergeParts(1, SrcReg);
    for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
      MergeParts.push_back(Undef);

    UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
  }

  // Unmerge to the original registers and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
  for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
       Size += DstSize) {
    UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
  }

  B.buildUnmerge(UnmergeResults, UnmergeSrc);
}

bool AMDGPUCallLowering::canLowerReturn(MachineFunction &MF,
                                        CallingConv::ID CallConv,
                                        SmallVectorImpl<BaseArgInfo> &Outs,
                                        bool IsVarArg) const {
  // For shaders. Vector types should be explicitly handled by CC.
  if (AMDGPU::isEntryFunctionCC(CallConv))
    return true;

  SmallVector<CCValAssign, 16> ArgLocs;
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs,
                 MF.getFunction().getContext());

  return checkReturn(CCInfo, Outs, TLI.CCAssignFnForReturn(CallConv, IsVarArg));
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();
  LLVMContext &Ctx = F.getContext();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  SmallVector<EVT, 8> SplitEVTs;
  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
  assert(VRegs.size() == SplitEVTs.size() &&
         "For each split Type there should be exactly one VReg.");

  // We pre-process the return value decomposed into EVTs.
  SmallVector<ArgInfo, 8> PreSplitRetInfos;

  // Further processing is applied to split the arguments from PreSplitRetInfos
  // into 32-bit pieces in SplitRetInfos before passing off to
  // handleAssignments.
  SmallVector<ArgInfo, 8> SplitRetInfos;

  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
    EVT VT = SplitEVTs[i];
    Register Reg = VRegs[i];
    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);

    if (VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (RetInfo.Flags[0].isSExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (RetInfo.Flags[0].isZExt()) {
        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
      }
    }

    if (Reg != RetInfo.Regs[0]) {
      RetInfo.Regs[0] = Reg;
      // Reset the arg flags after modifying Reg.
      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
    }

    splitToValueTypes(RetInfo, PreSplitRetInfos, DL, CC);

    // FIXME: This splitting should mostly be done by handleAssignments
    processSplitArgs(B, RetInfo,
                     PreSplitRetInfos, SplitRetInfos, DL, CC, true,
                     [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
                         LLT PartLLT, int VTSplitIdx) {
                       unpackRegsToOrigType(B, Regs, SrcReg,
                                            PreSplitRetInfos[VTSplitIdx], LLTy,
                                            PartLLT);
                     });
    PreSplitRetInfos.clear();
  }

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler, CC, F.isVarArg());
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
                                     ArrayRef<Register> VRegs,
                                     FunctionLoweringInfo &FLI) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!FLI.CanLowerReturn)
    insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
  else if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F, ArrayRef<ArrayRef<Register>> VRegs,
    FunctionLoweringInfo &FLI) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  // Insert the hidden sret parameter if the return value won't fit in the
  // return registers.
  if (!FLI.CanLowerReturn)
    insertSRetIncomingArgument(F, SplitArgs, FLI.DemoteRegister, MRI, DL);

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented.
    if (!IsGraphics && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(OrigArg, SplitArgs, DL, CC);
    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  //   - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  //   - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //     enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
  } else {
    if (!Subtarget.enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem ids are already packed, any of present incoming arguments will
    // carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
/// CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();
  CallingConv::ID CallConv = F.getCallingConv();

  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
      CallConv != CallingConv::AMDGPU_Gfx) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (AMDGPU::isShader(CallConv)) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;

  SmallVector<ArgInfo, 8> SplitArg;
  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(OrigArg, SplitArg, DL, Info.CallConv);

    processSplitArgs(
        MIRBuilder, OrigArg, SplitArg, OutArgs, DL, Info.CallConv, true,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy,
                               PartLLT);
        });

    SplitArg.clear();
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI &&
      Info.CallConv != CallingConv::AMDGPU_Gfx) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                                   AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (!ST.enableFlatScratch()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                               MFI->getScratchRSrcReg());
    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  auto OrigInsertPt = MIRBuilder.getInsertPt();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Insert this now to give us an anchor point for managing the insert point.
  MachineInstrBuilder CallSeqEnd =
      MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.CanLowerReturn) {
    insertSRetLoads(MIRBuilder, Info.OrigRet.Ty, Info.OrigRet.Regs,
                    Info.DemoteRegister, Info.DemoteStackIndex);
  } else if (!Info.OrigRet.Ty->isVoidTy()) {
    splitToValueTypes(Info.OrigRet, InArgs, DL, Info.CallConv);
  }

  // Make sure the raw argument copies are inserted before the marshalling to
  // the original types.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (Info.CanLowerReturn && !Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler, Info.CallConv,
                           Info.IsVarArg))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  CallSeqEnd.addImm(0)
            .addImm(CalleePopBytes);

  // Restore the insert point to after the call sequence.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
  return true;
}