//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
                     MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
      : ValueHandler(IsIncoming, B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about
      // it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the
    // value ends up in a VGPR.
    // FIXME: Assert this is a shader return.
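    // llvm.amdgcn.readfirstlane returns the value of the first active lane,
    // which is uniform across the wave, so copying its result into an SGPR
    // return register is always valid.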
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           CCAssignFn *AssignFn)
      : AMDGPUValueHandler(true, B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
        MemSize, inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call instruction).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
  }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};
}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
    : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
    MachineIRBuilder &B,
    const ArgInfo &OrigArg,
    SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, CallingConv::ID CallConv,
    bool IsOutgoing,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g.
      // [1 x double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
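    // Create one virtual register per ABI part register; PerformArgSplit
    // below connects these parts back to the original value.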
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy,
                             OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  LLT GCDTy = getGCDType(SrcTy, PartTy);
  if (GCDTy == PartTy) {
    // If this is already evenly divisible, we can create a simple unmerge.
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  MachineRegisterInfo &MRI = *B.getMRI();
  LLT DstTy = MRI.getType(DstRegs[0]);
  LLT LCMTy = getLCMType(SrcTy, PartTy);

  const unsigned LCMSize = LCMTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  Register UnmergeSrc = SrcReg;
  if (LCMSize != SrcSize) {
    // Widen to the common type.
    Register Undef = B.buildUndef(SrcTy).getReg(0);
    SmallVector<Register, 8> MergeParts(1, SrcReg);
    for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
      MergeParts.push_back(Undef);

    UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
  }

  // Unmerge to the original registers and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
  for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
       Size += DstSize) {
    UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
  }

  B.buildUnmerge(UnmergeResults, UnmergeSrc);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
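/// \p Ret is still uninserted at this point; this routine only emits the
/// value copies and adds the corresponding implicit register operands to it.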
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
      B, OrigRetInfo, SplitRetInfos, DL, CC, true,
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(B, Regs, SrcReg,
                             SplitRetInfos[VTSplitIdx],
                             LLTy, PartLLT);
      });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
        .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.
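  // Ret was created with buildInstrNoInsert so lowerReturnVal could attach
  // its implicit operands first; it can be inserted now.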
  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
    MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
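  // For example, a <3 x s16> result assembled from two <2 x s16> parts has
  // LCMTy <6 x s16>: concatenate the two parts plus one undef <2 x s16>, then
  // unmerge into two <3 x s16> registers and keep only the first.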
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register
  // types to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
        B, OrigArg, SplitArgs, DL, CC, false,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == VRegs[Idx][VTSplitIdx]);
          packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                  LLTy, PartLLT);
        });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
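  // If neither PERSP_* nor LINEAR_* ended up enabled (or POS_W_FLOAT is used
  // without any PERSP_*), force-enable input 0 and reserve VGPR0/VGPR1 for it.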
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
      = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
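  // These are the implicit SGPR inputs a fixed-ABI callee expects. Each one
  // is either forwarded from the caller's own incoming copy or recomputed,
  // and then copied into the callee's expected register before the call.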
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
      = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if they are
  // already packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If the incoming IDs are not packed, we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
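  // Packed layout in the single VGPR: X in bits [9:0], Y in bits [19:10],
  // Z in bits [29:20], matching the shifts by 10 and 20 below.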
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem IDs are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for \p CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target
    // in the instruction, which is not the case. Materialize the address here.
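    // The register operand of SI_CALL is what carries the target address, so
    // build a G_GLOBAL_VALUE for the callee and pass that; the original global
    // operand is kept on the instruction as well.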
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();

  if (AMDGPU::isShader(F.getCallingConv())) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
        MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
        });
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
      .addImm(0)
      .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                                   AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Insert copies for the SRD. In the HSA case, this should be an identity
  // copy.
  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                             MFI->getScratchRSrcReg());
  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  auto OrigInsertPt = MIRBuilder.getInsertPt();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Insert this now to give us an anchor point for managing the insert point.
  MachineInstrBuilder CallSeqEnd =
      MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    splitToValueTypes(
        MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv, false,
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
          packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
                                  Regs, LLTy, PartLLT);
        });
  }

  // Make sure the raw argument copies are inserted before the marshalling to
  // the original types.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  CallSeqEnd.addImm(0)
      .addImm(CalleePopBytes);

  // Restore the insert point to after the call sequence.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
  return true;
}