//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct OutgoingValueHandler : public AMDGPUValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
    : AMDGPUValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct IncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : AMDGPUValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call instruction itself).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public IncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
    : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct OutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                     MachineInstrBuilder MIB, CCAssignFn *AssignFn,
                     CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
                     int FPDiff = 0)
    : AMDGPUValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
      AssignFnVarArg(AssignFnVarArg),
      FPDiff(FPDiff), IsTailCall(IsTailCall) {}

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};
}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
  MachineIRBuilder &B,
  const ArgInfo &OrigArg,
  SmallVectorImpl<ArgInfo> &SplitArgs,
  const DataLayout &DL, CallingConv::ID CallConv,
  bool IsOutgoing,
  SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
                                         SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    B, OrigRetInfo, SplitRetInfos, DL, CC, true,
    [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
        int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, SrcReg,
                           SplitRetInfos[VTSplitIdx],
                           LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
        OrigArgRegs.size() == 1
        ? OrigArgRegs[0]
        : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
  MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
      B, OrigArg, SplitArgs, DL, CC, false,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register DstReg,
          LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        assert(DstReg == VRegs[Idx][VTSplitIdx]);
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  const ArgDescriptor *IncomingArgX = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
  const ArgDescriptor *IncomingArgY = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
  const ArgDescriptor *IncomingArgZ = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));

  const LLT S32 = LLT::scalar(32);

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX);
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY);

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ);

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem ids are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
      IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addImm(0);
    CallInst.add(Info.Callee);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();

  if (AMDGPU::isShader(F.getCallingConv())) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
      MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
      });
  }

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    LLVM_DEBUG(dbgs() << "Call return values not yet handled\n");
    return false;
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                             AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Insert copies for the SRD. In the HSA case, this should be an identity
  // copy.
  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                             MFI->getScratchRSrcReg());
  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
    .addImm(0)
    .addImm(CalleePopBytes);

  return true;
}