//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

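/// Value handler for outgoing return values. Sub-32-bit values are extended
/// before being copied into the assigned physical register, values assigned
/// to SGPRs get an amdgcn.readfirstlane in case they live in VGPRs, and each
/// used register is added as an implicit operand of the return instruction.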
struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

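/// Base handler for incoming values. Arguments passed on the stack are given
/// fixed stack objects and loaded through a frame index; arguments passed in
/// registers are copied out of their physical registers, truncating when the
/// location type was extended.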
struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

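/// Handler for incoming formal arguments; physical argument registers are
/// recorded as live-ins of the current basic block.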
struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

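/// Break \p OrigArg into one ArgInfo per legalized part register, mirroring
/// the type splitting the DAG calling convention lowering performs. When a
/// value needs more than one part register, \p PerformArgSplit is called to
/// connect the part registers back to the original virtual register.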
void AMDGPUCallLowering::splitToValueTypes(
  MachineIRBuilder &B,
  const ArgInfo &OrigArg, unsigned OrigArgIdx,
  SmallVectorImpl<ArgInfo> &SplitArgs,
  const DataLayout &DL, CallingConv::ID CallConv,
  SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // Fix up EVTs to an MVT.
      //
      // FIXME: This is pretty hacky. Why do we have to split the type
      // legalization logic between here and handleAssignments?
      if (OrigArgIdx != AttributeList::ReturnIndex && VT != RegVT) {
        assert(VT.getSizeInBits() < 32 &&
               "unexpected illegal type");
        Ty = Type::getInt32Ty(Ctx);
        Register OrigReg = Reg;
        Reg = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
        B.buildTrunc(OrigReg, Reg);
      }

      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we report all of the part registers for a single argument
    // and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

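// Split the value in \p SrcReg (of type \p SrcTy) into the part registers
// \p DstRegs (of type \p PartTy), unmerging, extending scalarized vector
// elements, or extracting from a padded value as required.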
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
                                         SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
    [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
        int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, SrcReg,
                           SplitRetInfos[VTSplitIdx],
                           LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

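/// Lower a return. Kernel returns and void shader returns end the wave with
/// S_ENDPGM; everything else builds the appropriate return pseudo and lowers
/// the returned values into it via lowerReturnVal.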
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {

  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

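/// Return a pointer to the kernel argument at byte offset \p Offset into the
/// kernarg segment, built from the preloaded kernarg segment pointer.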
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}

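/// Load the kernel argument of type \p ParamTy at byte offset \p Offset into
/// \p DstReg with a dereferenceable, invariant load from the kernarg segment.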
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

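/// Lower formal arguments for kernel entry points. Preloaded user SGPRs are
/// allocated first; each explicit kernel argument is then loaded from its
/// offset in the kernarg segment rather than being assigned by the calling
/// convention.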
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
  MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

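// Reassemble the original value in \p OrigRegs from the legalized part
// registers \p Regs: merge scalars, rebuild vectors from their pieces, or
// build and truncate a wider value as dictated by \p LLTy and \p PartLLT.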
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
      B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register DstReg,
          LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        assert(DstReg == VRegs[Idx][VTSplitIdx]);
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}