//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

namespace {

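/// Value handler for outgoing (return) values. Sub-32-bit values are
/// any-extended to a full 32-bit register, values assigned to SGPRs are routed
/// through an amdgcn.readfirstlane, and every physical register used is added
/// as an implicit operand of the instruction in MIB.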
struct OutgoingValueHandler : public CallLowering::ValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg;
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    } else
      ExtReg = extendRegister(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

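/// Base handler for incoming values (currently formal arguments). Copies
/// incoming physical registers into virtual registers, truncating when the
/// location type is wider than the value type, and creates fixed stack objects
/// for values passed in memory.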
struct IncomingArgHandler : public CallLowering::ValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and call results (the register is
  /// an implicit-def of the call instruction).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

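/// Incoming value handler for formal arguments; the physical argument
/// registers are recorded as live-ins of the current basic block.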
struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

}

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

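/// Split \p OrigArg into one ArgInfo per legal, register-sized piece for
/// \p CallConv, appending the pieces to \p SplitArgs. Whenever a single IR
/// value is broken into multiple part registers, \p PerformArgSplit is called
/// so the caller can emit the code that packs or unpacks the parts to and from
/// the original registers.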
void AMDGPUCallLowering::splitToValueTypes(
  MachineIRBuilder &B,
  const ArgInfo &OrigArg, unsigned OrigArgIdx,
  SmallVectorImpl<ArgInfo> &SplitArgs,
  const DataLayout &DL, CallingConv::ID CallConv,
  SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT != VT) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // Fixup EVTs to an MVT.
      //
      // FIXME: This is pretty hacky. Why do we have to split the type
      // legalization logic between here and handleAssignments?
      if (OrigArgIdx != AttributeList::ReturnIndex && VT != RegVT) {
        assert(VT.getSizeInBits() < 32 && "unexpected illegal type");
        Ty = Type::getInt32Ty(Ctx);
        Register OrigReg = Reg;
        Reg = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
        B.buildTrunc(OrigReg, Reg);
      }

      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

// TODO: Move to generic code
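// Split the value in SrcReg (of type SrcTy) into the part registers DstRegs
// (of type PartTy). Scalarized-and-extended vectors use an unmerge plus
// any-extends; otherwise the source is unmerged directly, padding through a
// wider undef-initialized type when PartTy does not evenly divide SrcTy.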
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
    [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
        int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, SrcReg,
                           SplitRetInfos[VTSplitIdx],
                           LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

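/// Lower an IR return. Kernels and shaders returning void end the wave with
/// S_ENDPGM; otherwise the appropriate return pseudo is built and the return
/// values are assigned by lowerReturnVal.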
bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

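/// Compute a constant-address-space pointer to the kernel argument at byte
/// offset \p Offset from the kernarg segment pointer.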
Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}

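/// Load the kernel argument of type \p ParamTy at byte offset \p Offset of the
/// kernarg segment into \p DstReg.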
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

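/// Lower formal arguments for an amdgpu_kernel. Kernel arguments are not
/// passed in registers; each argument is loaded from its offset in the kernarg
/// segment, with the special user/system SGPRs and VGPRs allocated around the
/// explicit arguments.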
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
    ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
static MachineInstrBuilder mergeVectorRegsToResultRegs(
  MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

// TODO: Move this to generic code
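// Rebuild the original registers in OrigRegs from the split part registers in
// Regs, choosing between merge, concat/build_vector, and truncation depending
// on how the original type (LLTy) relates to the part type (PartLLT).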
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

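/// Lower formal arguments for shaders and callable (non-kernel) functions
/// through the normal calling-convention machinery, including the special
/// handling of pixel-shader interpolation inputs and the allocation of the
/// implicit SGPR/VGPR inputs.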
bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions are not implemented yet.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

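    // Pixel shaders only receive VGPR inputs for interpolation modes that are
    // actually used. Track which PS inputs are allocated and enabled, and for
    // dead, unallocated inputs just define the argument vregs with undef and
    // skip the normal assignment.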
    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
      B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register DstReg,
          LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        assert(DstReg == VRegs[Idx][VTSplitIdx]);
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together. (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}