//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
                     MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
      : ValueHandler(IsIncoming, B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           CCAssignFn *AssignFn)
      : AMDGPUValueHandler(true, B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
  }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
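    // e.g. an s8 argument that was any-extended to s32 above now occupies 4
    // bytes in the register, so the store below must cover the widened size.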
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};
}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
    MachineIRBuilder &B,
    const ArgInfo &OrigArg,
    SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, CallingConv::ID CallConv,
    bool IsOutgoing,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
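// e.g. getMultipleType(s32, 3) is s96, and getMultipleType(v2s16, 2) is v4s16.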
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
                                         SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
      B, OrigRetInfo, SplitRetInfos, DL, CC, true,
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(B, Regs, SrcReg,
                             SplitRetInfos[VTSplitIdx],
                             LLTy, PartLLT);
      });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
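  // Note: the allocation order below is assumed to match the order in which
  // the hardware preloads the user SGPRs (private segment buffer, dispatch
  // ptr, queue ptr, kernarg segment ptr, dispatch id, flat scratch init).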
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
    MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
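  // e.g. packing a v3s16 result from v2s16 parts: LCMTy is v6s16, built from
  // the two real v2s16 sources plus one undef pad, then unmerged into two
  // v3s16 values of which only the first is used.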
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
        B, OrigArg, SplitArgs, DL, CC, false,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == VRegs[Idx][VTSplitIdx]);
          packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                  LLTy, PartLLT);
        });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here; the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
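  // Implicit inputs that the caller may need to forward to the callee. Under
  // the fixed ABI each of these has a known outgoing register.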
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register or pass it as is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
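  // The packed layout built below places X in bits [9:0], Y in bits [19:10]
  // and Z in bits [29:20] of a single 32-bit register.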
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem ids are already packed, so any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
/// CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();

  if (AMDGPU::isShader(F.getCallingConv())) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
        MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
        });
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                                   AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Insert copies for the SRD. In the HSA case, this should be an identity
  // copy.
  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                             MFI->getScratchRSrcReg());
  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  auto OrigInsertPt = MIRBuilder.getInsertPt();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Insert this now to give us an anchor point for managing the insert point.
  MachineInstrBuilder CallSeqEnd =
      MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    splitToValueTypes(
        MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv, false,
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
          packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
                                  Regs, LLTy, PartLLT);
        });
  }

  // Make sure the raw argument copies are inserted before the marshalling to
  // the original types.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  CallSeqEnd.addImm(0)
      .addImm(CalleePopBytes);

  // Restore the insert point to after the call sequence.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
  return true;
}