//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about
      // it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    Register AddrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
    MIRBuilder.buildFrameIndex(AddrReg, FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg;
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

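    // The calling convention may have promoted the value to a wider location
    // type. Copy from the physical register at that width and truncate back
    // to the original value type where needed.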
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about
      // it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    // FIXME: Get alignment
    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        1);
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the BL).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {
}

void AMDGPUCallLowering::splitToValueTypes(
    const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    Type *Ty = VT.getTypeForEVT(Ctx);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g.
      // [1 x double] -> double).
      SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
                             OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    LLT LLTy = getLLTForType(*Ty, DL);

    SmallVector<Register, 8> SplitRegs;

    EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
    Type *PartTy = PartVT.getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy,
                             OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
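// For example, OrigTy = s32 with Factor = 3 gives s96, and
// OrigTy = <2 x s16> with Factor = 3 gives <6 x s16>.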
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  Register BigReg = MRI.createGenericVirtualRegister(BigTy);
  B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], BigReg, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val,
                                        ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
      OrigRetInfo, SplitRetInfos, DL, MRI, CC,
      [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
      });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());

  OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();

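  // Shaders return to the epilog; callable functions return by restoring the
  // program counter from the saved return address.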
  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register DstReg = MRI.createGenericVirtualRegister(PtrType);
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
  B.buildConstant(OffsetReg, Offset);

  B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);

  return DstReg;
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
                                        Type *ParamTy, uint64_t Offset,
                                        unsigned Align,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
                                       MachineMemOperand::MODereferenceable |
                                       MachineMemOperand::MOInvariant,
                              TypeSize, Align);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
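  // Each enabled user SGPR input is recorded in SIMachineFunctionInfo, added
  // as a function live-in, and reserved in CCInfo so that normal argument
  // assignment does not reuse the register.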
  if (Info.hasPrivateSegmentBuffer()) {
    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    unsigned DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we
  // read these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const unsigned KernArgBaseAlign = 16;
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

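    // Load the argument from the kernarg segment. If the IR value was split
    // across several virtual registers, load into a single wide register
    // first and unpack it afterwards.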
    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
        OrigArgRegs.size() == 1
            ? OrigArgRegs[0]
            : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  if (!LLTy.isVector() && !PartLLT.isVector()) {
    B.buildMerge(OrigRegs[0], Regs);
    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(LLTy.getElementType() == PartLLT.getElementType());

    int DstElts = LLTy.getNumElements();
    int PartElts = PartLLT.getNumElements();
    if (DstElts % PartElts == 0)
      B.buildConcatVectors(OrigRegs[0], Regs);
    else {
      // Deal with v3s16 split into v2s16
      assert(PartElts == 2 && DstElts % 2 != 0);
      int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);

      LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
      auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
      B.buildExtract(OrigRegs[0], RoundedConcat, 0);
    }

    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();
  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.
    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(DstEltTy, Regs.take_front(PartsPerElt));
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
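  // Kernel arguments are instead loaded explicitly from the kernarg segment
  // (see lowerFormalArgumentsKernel).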
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);

    splitToValueTypes(
        OrigArg, SplitArgs, DL, MRI, CC,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                  LLTy, PartLLT);
        });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the
      // frontend set up an input arg for a particular interpolation mode, but
      // nothing uses that input arg. Really we should have an earlier pass
      // that removes such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
    CCInfo.AllocateReg(Info->getFrameOffsetReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}