//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(bool IsIncoming, MachineIRBuilder &B,
                     MachineRegisterInfo &MRI, CCAssignFn *AssignFn)
      : ValueHandler(IsIncoming, B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

struct AMDGPUOutgoingValueHandler : public AMDGPUValueHandler {
  AMDGPUOutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                             MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(false, B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

struct AMDGPUIncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  AMDGPUIncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                           CCAssignFn *AssignFn)
      : AMDGPUValueHandler(true, B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t MemSize,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // The reported memory location may be wider than the value.
    const LLT RegTy = MRI.getType(ValVReg);
    MemSize = std::min(static_cast<uint64_t>(RegTy.getSizeInBytes()), MemSize);

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, MemSize,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in), and a call instruction
  /// (it's an implicit-def of the call instruction).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
};

struct FormalArgHandler : public AMDGPUIncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public AMDGPUIncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUIncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct AMDGPUOutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  AMDGPUOutgoingArgHandler(MachineIRBuilder &MIRBuilder,
                           MachineRegisterInfo &MRI, MachineInstrBuilder MIB,
                           CCAssignFn *AssignFn, CCAssignFn *AssignFnVarArg,
                           bool IsTailCall = false, int FPDiff = 0)
      : AMDGPUValueHandler(false, MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg), FPDiff(FPDiff), IsTailCall(IsTailCall) {
  }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOStore, Size,
        commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};
}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

void AMDGPUCallLowering::splitToValueTypes(
    MachineIRBuilder &B,
    const ArgInfo &OrigArg,
    SmallVectorImpl<ArgInfo> &SplitArgs,
    const DataLayout &DL, CallingConv::ID CallConv,
    bool IsOutgoing,
    SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  LLT GCDTy = getGCDType(SrcTy, PartTy);
  if (GCDTy == PartTy) {
    // If this is already evenly divisible, we can create a simple unmerge.
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  MachineRegisterInfo &MRI = *B.getMRI();
  LLT DstTy = MRI.getType(DstRegs[0]);
  LLT LCMTy = getLCMType(SrcTy, PartTy);

  const unsigned LCMSize = LCMTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  Register UnmergeSrc = SrcReg;
  if (LCMSize != SrcSize) {
    // Widen to the common type.
    Register Undef = B.buildUndef(SrcTy).getReg(0);
    SmallVector<Register, 8> MergeParts(1, SrcReg);
    for (unsigned Size = SrcSize; Size != LCMSize; Size += SrcSize)
      MergeParts.push_back(Undef);

    UnmergeSrc = B.buildMerge(LCMTy, MergeParts).getReg(0);
  }

  // Unmerge to the original registers and pad with dead defs.
  SmallVector<Register, 8> UnmergeResults(DstRegs.begin(), DstRegs.end());
  for (unsigned Size = DstSize * DstRegs.size(); Size != LCMSize;
       Size += DstSize) {
    UnmergeResults.push_back(MRI.createGenericVirtualRegister(DstTy));
  }

  B.buildUnmerge(UnmergeResults, UnmergeSrc);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
      B, OrigRetInfo, SplitRetInfos, DL, CC, true,
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(B, Regs, SrcReg,
                             SplitRetInfos[VTSplitIdx],
                             LLTy, PartLLT);
      });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  AMDGPUOutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd =
      (IsShader && MFI->returnsVoid()) || AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
                                           Type *ParamTy,
                                           uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
}

void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);

  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  lowerParameterPtr(PtrReg, B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
    if (!ABIAlign)
      ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);

    if (IsByRef) {
      unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();

      assert(VRegs[i].size() == 1 &&
             "expected only one register for byval pointers");
      if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
        lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
      } else {
        const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
        Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
        lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);

        B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
      }
    } else {
      ArrayRef<Register> OrigArgRegs = VRegs[i];
      Register ArgReg =
          OrigArgRegs.size() == 1
              ? OrigArgRegs[0]
              : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

      lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
      if (OrigArgRegs.size() > 1)
        unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    }

    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
    MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with a vector whose 64-bit elements were decomposed into 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsGraphics = AMDGPU::isGraphics(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsGraphics && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
        B, OrigArg, SplitArgs, DL, CC, false,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == VRegs[Idx][VTSplitIdx]);
          packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                  LLTy, PartLLT);
        });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
            countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
  } else {
    if (!Subtarget.enableFlatScratch())
      CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if they are
  // already packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  auto WorkitemIDX =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  auto WorkitemIDY =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  auto WorkitemIDZ =
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);

  const ArgDescriptor *IncomingArgX = std::get<0>(WorkitemIDX);
  const ArgDescriptor *IncomingArgY = std::get<0>(WorkitemIDY);
  const ArgDescriptor *IncomingArgZ = std::get<0>(WorkitemIDZ);
  const LLT S32 = LLT::scalar(32);

  // If incoming ids are not packed we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
                       std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
                       std::get<2>(WorkitemIDY));

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
                       std::get<2>(WorkitemIDZ));

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem IDs are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
        IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
                       &AMDGPU::VGPR_32RegClass, S32);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for \p CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addReg(Info.Callee.getReg());
    CallInst.addImm(0);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
        LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();
  CallingConv::ID CallConv = F.getCallingConv();

  if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
      CallConv != CallingConv::AMDGPU_Gfx) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (AMDGPU::isShader(CallConv)) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
        MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
        // FIXME: We should probably be passing multiple registers to
        // handleAssignments to do this
        [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
            int VTSplitIdx) {
          unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
        });
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  AMDGPUOutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                                   AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (!ST.enableFlatScratch()) {
    // Insert copies for the SRD. In the HSA case, this should be an identity
    // copy.
    auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                               MFI->getScratchRSrcReg());
    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
    MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
  }

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  auto OrigInsertPt = MIRBuilder.getInsertPt();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // Insert this now to give us an anchor point for managing the insert point.
  MachineInstrBuilder CallSeqEnd =
      MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN);

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    splitToValueTypes(
        MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv, false,
        [&](ArrayRef<Register> Regs, Register DstReg,
            LLT LLTy, LLT PartLLT, int VTSplitIdx) {
          assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]);
          packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx],
                                  Regs, LLTy, PartLLT);
        });
  }

  // Make sure the raw argument copies are inserted before the marshalling to
  // the original types.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd);

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  CallSeqEnd.addImm(0)
    .addImm(CalleePopBytes);

  // Restore the insert point to after the call sequence.
  MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt);
  return true;
}