//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
    MachineIRBuilder &B,
    const ArgInfo &OrigArg, unsigned OrigArgIdx,
    SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, CallingConv::ID CallConv,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // Fixup EVTs to an MVT.
      //
      // FIXME: This is pretty hacky. Why do we have to split the type
      // legalization logic between here and handleAssignments?
      if (OrigArgIdx != AttributeList::ReturnIndex && VT != RegVT) {
        assert(VT.getSizeInBits() < 32 &&
               "unexpected illegal type");
        Ty = Type::getInt32Ty(Ctx);
        Register OrigReg = Reg;
        Reg = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
        B.buildTrunc(OrigReg, Reg);
      }

      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
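    //
    // For now each part register is reported as its own argument; e.g. a
    // 64-bit scalar is typically reported as two s32 parts here, and
    // PerformArgSplit connects the parts back to the original value.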
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
                                         SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
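/// The physical registers holding the return value are added to \p Ret as
/// implicit uses.
/// \returns true on success.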
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
    [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
        int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, SrcReg,
                           SplitRetInfos[VTSplitIdx],
                           LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.
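
  // Ret was created with buildInstrNoInsert so that the value copies emitted
  // by lowerReturnVal and the return address copy above come first; insert the
  // return itself now.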
  B.insertInstr(Ret);
  return true;
}

Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
    MachineIRBuilder &B, ArrayRef<Register> DstRegs,
    ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
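  // E.g. a v3s16 result assembled from two v2s16 parts is padded with an undef
  // v2s16, concatenated to v6s16, and then unmerged into two v3s16 defs, the
  // second of which is dead.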
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
      B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register DstReg,
          LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        assert(DstReg == VRegs[Idx][VTSplitIdx]);
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}