1 //===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements the lowering of LLVM calls to machine code calls for
11 /// GlobalISel.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUCallLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUISelLowering.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPUSubtarget.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "SIISelLowering.h"
23 #include "SIMachineFunctionInfo.h"
24 #include "SIRegisterInfo.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/CallingConvLower.h"
27 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
28 #include "llvm/CodeGen/MachineInstrBuilder.h"
29 #include "llvm/Support/LowLevelTypeImpl.h"
30 
31 #define DEBUG_TYPE "amdgpu-call-lowering"
32 
33 using namespace llvm;
34 
35 namespace {
36 
37 struct AMDGPUValueHandler : public CallLowering::ValueHandler {
38   AMDGPUValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
39                      CCAssignFn *AssignFn)
40     : ValueHandler(B, MRI, AssignFn) {}
41 
42   /// Wrapper around extendRegister to ensure we extend to a full 32-bit
43   /// register.
44   Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
45     if (VA.getLocVT().getSizeInBits() < 32) {
46       // 16-bit types are reported as legal for 32-bit registers. We need to
47       // extend and do a 32-bit copy to avoid the verifier complaining about it.
48       return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
49     }
50 
51     return extendRegister(ValVReg, VA);
52   }
53 };
54 
55 struct OutgoingValueHandler : public AMDGPUValueHandler {
56   OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
57                        MachineInstrBuilder MIB, CCAssignFn *AssignFn)
58       : AMDGPUValueHandler(B, MRI, AssignFn), MIB(MIB) {}
59 
60   MachineInstrBuilder MIB;
61 
62   bool isIncomingArgumentHandler() const override { return false; }
63 
64   Register getStackAddress(uint64_t Size, int64_t Offset,
65                            MachinePointerInfo &MPO) override {
66     llvm_unreachable("not implemented");
67   }
68 
69   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
70                             MachinePointerInfo &MPO, CCValAssign &VA) override {
71     llvm_unreachable("not implemented");
72   }
73 
74   void assignValueToReg(Register ValVReg, Register PhysReg,
75                         CCValAssign &VA) override {
76     Register ExtReg = extendRegisterMin32(ValVReg, VA);
77 
78     // If this is a scalar return, insert a readfirstlane just in case the value
79     // ends up in a VGPR.
80     // FIXME: Assert this is a shader return.
81     const SIRegisterInfo *TRI
82       = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
83     if (TRI->isSGPRReg(MRI, PhysReg)) {
84       auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
85                                               {MRI.getType(ExtReg)}, false)
86         .addReg(ExtReg);
87       ExtReg = ToSGPR.getReg(0);
88     }
89 
90     MIRBuilder.buildCopy(PhysReg, ExtReg);
91     MIB.addUse(PhysReg, RegState::Implicit);
92   }
93 
94   bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
95                  CCValAssign::LocInfo LocInfo,
96                  const CallLowering::ArgInfo &Info,
97                  ISD::ArgFlagsTy Flags,
98                  CCState &State) override {
99     return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
100   }
101 };
102 
103 struct IncomingArgHandler : public AMDGPUValueHandler {
104   uint64_t StackUsed = 0;
105 
106   IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
107                      CCAssignFn *AssignFn)
108     : AMDGPUValueHandler(B, MRI, AssignFn) {}
109 
110   Register getStackAddress(uint64_t Size, int64_t Offset,
111                            MachinePointerInfo &MPO) override {
112     auto &MFI = MIRBuilder.getMF().getFrameInfo();
113     int FI = MFI.CreateFixedObject(Size, Offset, true);
114     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
115     auto AddrReg = MIRBuilder.buildFrameIndex(
116         LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
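    // Remember how far into the incoming argument area this argument reaches.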
117     StackUsed = std::max(StackUsed, Size + Offset);
118     return AddrReg.getReg(0);
119   }
120 
121   void assignValueToReg(Register ValVReg, Register PhysReg,
122                         CCValAssign &VA) override {
123     markPhysRegUsed(PhysReg);
124 
125     if (VA.getLocVT().getSizeInBits() < 32) {
126       // 16-bit types are reported as legal for 32-bit registers. We need to do
127       // a 32-bit copy, and truncate to avoid the verifier complaining about it.
128       auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
129       MIRBuilder.buildTrunc(ValVReg, Copy);
130       return;
131     }
132 
133     switch (VA.getLocInfo()) {
134     case CCValAssign::LocInfo::SExt:
135     case CCValAssign::LocInfo::ZExt:
136     case CCValAssign::LocInfo::AExt: {
137       auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
138       MIRBuilder.buildTrunc(ValVReg, Copy);
139       break;
140     }
141     default:
142       MIRBuilder.buildCopy(ValVReg, PhysReg);
143       break;
144     }
145   }
146 
147   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
148                             MachinePointerInfo &MPO, CCValAssign &VA) override {
149     MachineFunction &MF = MIRBuilder.getMF();
150 
151     // FIXME: Get alignment
152     auto MMO = MF.getMachineMemOperand(
153         MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
154         inferAlignFromPtrInfo(MF, MPO));
155     MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
156   }
157 
  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and a call instruction
  /// (it's an implicit-def of the call).
161   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
162 
163   // FIXME: What is the point of this being a callback?
164   bool isIncomingArgumentHandler() const override { return true; }
165 };
166 
167 struct FormalArgHandler : public IncomingArgHandler {
168   FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
169                    CCAssignFn *AssignFn)
170     : IncomingArgHandler(B, MRI, AssignFn) {}
171 
172   void markPhysRegUsed(unsigned PhysReg) override {
173     MIRBuilder.getMBB().addLiveIn(PhysReg);
174   }
175 };
176 
177 struct CallReturnHandler : public IncomingArgHandler {
178   CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
179                     MachineInstrBuilder MIB, CCAssignFn *AssignFn)
180     : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
181 
182   void markPhysRegUsed(unsigned PhysReg) override {
183     MIB.addDef(PhysReg, RegState::Implicit);
184   }
185 
186   MachineInstrBuilder MIB;
187 };
188 
189 struct OutgoingArgHandler : public AMDGPUValueHandler {
190   MachineInstrBuilder MIB;
191   CCAssignFn *AssignFnVarArg;
192 
193   /// For tail calls, the byte offset of the call's argument area from the
194   /// callee's. Unused elsewhere.
195   int FPDiff;
196 
197   // Cache the SP register vreg if we need it more than once in this call site.
198   Register SPReg;
199 
200   bool IsTailCall;
201 
202   OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
203                      MachineInstrBuilder MIB, CCAssignFn *AssignFn,
204                      CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
205                      int FPDiff = 0)
206       : AMDGPUValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
207         AssignFnVarArg(AssignFnVarArg),
208         FPDiff(FPDiff), IsTailCall(IsTailCall) {}
209 
210   bool isIncomingArgumentHandler() const override { return false; }
211 
212   Register getStackAddress(uint64_t Size, int64_t Offset,
213                            MachinePointerInfo &MPO) override {
214     MachineFunction &MF = MIRBuilder.getMF();
215     const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
216     const LLT S32 = LLT::scalar(32);
217 
218     if (IsTailCall) {
219       llvm_unreachable("implement me");
220     }
221 
222     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
223 
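    // Outgoing stack arguments are addressed relative to the stack pointer, so
    // materialize a pointer-typed copy of it once per call site.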
224     if (!SPReg)
225       SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
226 
227     auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
228 
229     auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
230     MPO = MachinePointerInfo::getStack(MF, Offset);
231     return AddrReg.getReg(0);
232   }
233 
234   void assignValueToReg(Register ValVReg, Register PhysReg,
235                         CCValAssign &VA) override {
236     MIB.addUse(PhysReg, RegState::Implicit);
237     Register ExtReg = extendRegisterMin32(ValVReg, VA);
238     MIRBuilder.buildCopy(PhysReg, ExtReg);
239   }
240 
241   void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
242                             MachinePointerInfo &MPO, CCValAssign &VA) override {
243     MachineFunction &MF = MIRBuilder.getMF();
244     uint64_t LocMemOffset = VA.getLocMemOffset();
245     const auto &ST = MF.getSubtarget<GCNSubtarget>();
246 
247     auto MMO = MF.getMachineMemOperand(
248       MPO, MachineMemOperand::MOStore, Size,
249       commonAlignment(ST.getStackAlignment(), LocMemOffset));
250     MIRBuilder.buildStore(ValVReg, Addr, *MMO);
251   }
252 
253   void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
254                             uint64_t Size, MachinePointerInfo &MPO,
255                             CCValAssign &VA) override {
256     Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
257                            ? extendRegister(Arg.Regs[0], VA)
258                            : Arg.Regs[0];
259 
    // If we extended, we might need to adjust the MMO's Size.
261     const LLT RegTy = MRI.getType(ValVReg);
262     if (RegTy.getSizeInBytes() > Size)
263       Size = RegTy.getSizeInBytes();
264 
265     assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
266   }
267 };
268 }
269 
270 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
271   : CallLowering(&TLI) {
272 }
273 
// FIXME: Compatibility shim
275 static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
276   switch (MIOpc) {
277   case TargetOpcode::G_SEXT:
278     return ISD::SIGN_EXTEND;
279   case TargetOpcode::G_ZEXT:
280     return ISD::ZERO_EXTEND;
281   case TargetOpcode::G_ANYEXT:
282     return ISD::ANY_EXTEND;
283   default:
284     llvm_unreachable("not an extend opcode");
285   }
286 }
287 
288 void AMDGPUCallLowering::splitToValueTypes(
289   MachineIRBuilder &B,
290   const ArgInfo &OrigArg,
291   SmallVectorImpl<ArgInfo> &SplitArgs,
292   const DataLayout &DL, CallingConv::ID CallConv,
293   bool IsOutgoing,
294   SplitArgTy PerformArgSplit) const {
295   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
296   LLVMContext &Ctx = OrigArg.Ty->getContext();
297 
298   if (OrigArg.Ty->isVoidTy())
299     return;
300 
301   SmallVector<EVT, 4> SplitVTs;
302   ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
303 
304   assert(OrigArg.Regs.size() == SplitVTs.size());
305 
306   int SplitIdx = 0;
307   for (EVT VT : SplitVTs) {
308     Register Reg = OrigArg.Regs[SplitIdx];
309     Type *Ty = VT.getTypeForEVT(Ctx);
310     LLT LLTy = getLLTForType(*Ty, DL);
311 
312     if (IsOutgoing && VT.isScalarInteger()) {
313       unsigned ExtendOp = TargetOpcode::G_ANYEXT;
314       if (OrigArg.Flags[0].isSExt()) {
315         assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
316         ExtendOp = TargetOpcode::G_SEXT;
317       } else if (OrigArg.Flags[0].isZExt()) {
318         assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
319         ExtendOp = TargetOpcode::G_ZEXT;
320       }
321 
322       EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
323                                           extOpcodeToISDExtOpcode(ExtendOp));
324       if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
325         VT = ExtVT;
326         Ty = ExtVT.getTypeForEVT(Ctx);
327         LLTy = getLLTForType(*Ty, DL);
328         Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
329       }
330     }
331 
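    // Ask the target how many registers, and of what register type, this value
    // occupies for the calling convention; if it needs more than one, split it
    // into that many part registers below.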
332     unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
333     MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
334 
335     if (NumParts == 1) {
336       // No splitting to do, but we want to replace the original type (e.g. [1 x
337       // double] -> double).
338       SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);
339 
340       ++SplitIdx;
341       continue;
342     }
343 
344     SmallVector<Register, 8> SplitRegs;
345     Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
346     LLT PartLLT = getLLTForType(*PartTy, DL);
347     MachineRegisterInfo &MRI = *B.getMRI();
348 
349     // FIXME: Should we be reporting all of the part registers for a single
350     // argument, and let handleAssignments take care of the repacking?
351     for (unsigned i = 0; i < NumParts; ++i) {
352       Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
353       SplitRegs.push_back(PartReg);
354       SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
355     }
356 
357     PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
358 
359     ++SplitIdx;
360   }
361 }
362 
363 // Get the appropriate type to make \p OrigTy \p Factor times bigger.
364 static LLT getMultipleType(LLT OrigTy, int Factor) {
365   if (OrigTy.isVector()) {
366     return LLT::vector(OrigTy.getNumElements() * Factor,
367                        OrigTy.getElementType());
368   }
369 
370   return LLT::scalar(OrigTy.getSizeInBits() * Factor);
371 }
372 
373 // TODO: Move to generic code
374 static void unpackRegsToOrigType(MachineIRBuilder &B,
375                                  ArrayRef<Register> DstRegs,
376                                  Register SrcReg,
377                                  const CallLowering::ArgInfo &Info,
378                                  LLT SrcTy,
379                                  LLT PartTy) {
380   assert(DstRegs.size() > 1 && "Nothing to unpack");
381 
382   const unsigned SrcSize = SrcTy.getSizeInBits();
383   const unsigned PartSize = PartTy.getSizeInBits();
384 
385   if (SrcTy.isVector() && !PartTy.isVector() &&
386       PartSize > SrcTy.getElementType().getSizeInBits()) {
387     // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
390     for (int i = 0, e = DstRegs.size(); i != e; ++i)
391       B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
392     return;
393   }
394 
395   if (SrcSize % PartSize == 0) {
396     B.buildUnmerge(DstRegs, SrcReg);
397     return;
398   }
399 
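  // SrcSize is not a multiple of PartSize. Round the part count up, insert the
  // source into an undef value of the padded size, and extract each part from
  // that.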
400   const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;
401 
402   LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
403   auto ImpDef = B.buildUndef(BigTy);
404 
405   auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);
406 
407   int64_t Offset = 0;
408   for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
409     B.buildExtract(DstRegs[i], Big, Offset);
410 }
411 
412 /// Lower the return value for the already existing \p Ret. This assumes that
413 /// \p B's insertion point is correct.
414 bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
415                                         const Value *Val, ArrayRef<Register> VRegs,
416                                         MachineInstrBuilder &Ret) const {
417   if (!Val)
418     return true;
419 
420   auto &MF = B.getMF();
421   const auto &F = MF.getFunction();
422   const DataLayout &DL = MF.getDataLayout();
423   MachineRegisterInfo *MRI = B.getMRI();
424 
425   CallingConv::ID CC = F.getCallingConv();
426   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
427 
428   ArgInfo OrigRetInfo(VRegs, Val->getType());
429   setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
430   SmallVector<ArgInfo, 4> SplitRetInfos;
431 
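  // Split the return value into legal pieces, unpacking each original value
  // into its part registers as we go.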
432   splitToValueTypes(
433     B, OrigRetInfo, SplitRetInfos, DL, CC, true,
434     [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
435         int VTSplitIdx) {
436       unpackRegsToOrigType(B, Regs, SrcReg,
437                            SplitRetInfos[VTSplitIdx],
438                            LLTy, PartLLT);
439     });
440 
441   CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
442   OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
443   return handleAssignments(B, SplitRetInfos, RetHandler);
444 }
445 
446 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
447                                      const Value *Val,
448                                      ArrayRef<Register> VRegs) const {
449 
450   MachineFunction &MF = B.getMF();
451   MachineRegisterInfo &MRI = MF.getRegInfo();
452   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
453   MFI->setIfReturnsVoid(!Val);
454 
455   assert(!Val == VRegs.empty() && "Return value without a vreg");
456 
457   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
458   const bool IsShader = AMDGPU::isShader(CC);
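  // Graphics shaders returning void and kernels end the wave rather than
  // returning to a caller.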
459   const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
460                          AMDGPU::isKernel(CC);
461   if (IsWaveEnd) {
462     B.buildInstr(AMDGPU::S_ENDPGM)
463       .addImm(0);
464     return true;
465   }
466 
  const auto &ST = MF.getSubtarget<GCNSubtarget>();
468 
469   unsigned ReturnOpc =
470       IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
471 
472   auto Ret = B.buildInstrNoInsert(ReturnOpc);
473   Register ReturnAddrVReg;
474   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
475     ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
476     Ret.addUse(ReturnAddrVReg);
477   }
478 
479   if (!lowerReturnVal(B, Val, VRegs, Ret))
480     return false;
481 
482   if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
483     const SIRegisterInfo *TRI = ST.getRegisterInfo();
484     Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
485                                          &AMDGPU::SGPR_64RegClass);
486     B.buildCopy(ReturnAddrVReg, LiveInReturn);
487   }
488 
489   // TODO: Handle CalleeSavedRegsViaCopy.
490 
491   B.insertInstr(Ret);
492   return true;
493 }
494 
495 void AMDGPUCallLowering::lowerParameterPtr(Register DstReg, MachineIRBuilder &B,
496                                            Type *ParamTy,
497                                            uint64_t Offset) const {
498   MachineFunction &MF = B.getMF();
499   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
500   MachineRegisterInfo &MRI = MF.getRegInfo();
501   Register KernArgSegmentPtr =
502     MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
503   Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
504 
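  // The kernarg segment pointer is preloaded into SGPRs; form the argument's
  // address by adding its byte offset to that base pointer.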
505   auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
506 
507   B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
508 }
509 
510 void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
511                                         uint64_t Offset, Align Alignment,
512                                         Register DstReg) const {
513   MachineFunction &MF = B.getMF();
514   const Function &F = MF.getFunction();
515   const DataLayout &DL = F.getParent()->getDataLayout();
516   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
517   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
518 
519   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
520   Register PtrReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
521   lowerParameterPtr(PtrReg, B, ParamTy, Offset);
522 
523   MachineMemOperand *MMO = MF.getMachineMemOperand(
524       PtrInfo,
525       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
526           MachineMemOperand::MOInvariant,
527       TypeSize, Alignment);
528 
529   B.buildLoad(DstReg, PtrReg, *MMO);
530 }
531 
532 // Allocate special inputs passed in user SGPRs.
533 static void allocateHSAUserSGPRs(CCState &CCInfo,
534                                  MachineIRBuilder &B,
535                                  MachineFunction &MF,
536                                  const SIRegisterInfo &TRI,
537                                  SIMachineFunctionInfo &Info) {
538   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
539   if (Info.hasPrivateSegmentBuffer()) {
540     Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
541     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
542     CCInfo.AllocateReg(PrivateSegmentBufferReg);
543   }
544 
545   if (Info.hasDispatchPtr()) {
546     Register DispatchPtrReg = Info.addDispatchPtr(TRI);
547     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
548     CCInfo.AllocateReg(DispatchPtrReg);
549   }
550 
551   if (Info.hasQueuePtr()) {
552     Register QueuePtrReg = Info.addQueuePtr(TRI);
553     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
554     CCInfo.AllocateReg(QueuePtrReg);
555   }
556 
557   if (Info.hasKernargSegmentPtr()) {
558     MachineRegisterInfo &MRI = MF.getRegInfo();
559     Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
560     const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
561     Register VReg = MRI.createGenericVirtualRegister(P4);
562     MRI.addLiveIn(InputPtrReg, VReg);
563     B.getMBB().addLiveIn(InputPtrReg);
564     B.buildCopy(VReg, InputPtrReg);
565     CCInfo.AllocateReg(InputPtrReg);
566   }
567 
568   if (Info.hasDispatchID()) {
569     Register DispatchIDReg = Info.addDispatchID(TRI);
570     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
571     CCInfo.AllocateReg(DispatchIDReg);
572   }
573 
574   if (Info.hasFlatScratchInit()) {
575     Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
576     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
577     CCInfo.AllocateReg(FlatScratchInitReg);
578   }
579 
580   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
581   // these from the dispatch pointer.
582 }
583 
584 bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
585     MachineIRBuilder &B, const Function &F,
586     ArrayRef<ArrayRef<Register>> VRegs) const {
587   MachineFunction &MF = B.getMF();
588   const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
589   MachineRegisterInfo &MRI = MF.getRegInfo();
590   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
591   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
592   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
593 
594   const DataLayout &DL = F.getParent()->getDataLayout();
595 
596   SmallVector<CCValAssign, 16> ArgLocs;
597   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
598 
599   allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
600 
601   unsigned i = 0;
602   const Align KernArgBaseAlign(16);
603   const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
604   uint64_t ExplicitArgOffset = 0;
605 
606   // TODO: Align down to dword alignment and extract bits for extending loads.
607   for (auto &Arg : F.args()) {
608     const bool IsByRef = Arg.hasByRefAttr();
609     Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
610     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
611     if (AllocSize == 0)
612       continue;
613 
614     MaybeAlign ABIAlign = IsByRef ? Arg.getParamAlign() : None;
615     if (!ABIAlign)
616       ABIAlign = DL.getABITypeAlign(ArgTy);
617 
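    // Place the argument at the next offset that satisfies its ABI alignment,
    // then advance the running offset past its allocated size.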
618     uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
619     ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
620 
621     if (Arg.use_empty()) {
622       ++i;
623       continue;
624     }
625 
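    // The kernarg segment base is modelled as 16-byte aligned, so the best
    // known alignment for this argument's load is its common alignment with
    // the offset.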
626     Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
627 
628     if (IsByRef) {
629       unsigned ByRefAS = cast<PointerType>(Arg.getType())->getAddressSpace();
630 
      assert(VRegs[i].size() == 1 &&
             "expected only one register for byref pointers");
633       if (ByRefAS == AMDGPUAS::CONSTANT_ADDRESS) {
634         lowerParameterPtr(VRegs[i][0], B, ArgTy, ArgOffset);
635       } else {
636         const LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
637         Register PtrReg = MRI.createGenericVirtualRegister(ConstPtrTy);
638         lowerParameterPtr(PtrReg, B, ArgTy, ArgOffset);
639 
640         B.buildAddrSpaceCast(VRegs[i][0], PtrReg);
641       }
642     } else {
643       ArrayRef<Register> OrigArgRegs = VRegs[i];
644       Register ArgReg =
645         OrigArgRegs.size() == 1
646         ? OrigArgRegs[0]
647         : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
648 
649       lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
650       if (OrigArgRegs.size() > 1)
651         unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
652     }
653 
654     ++i;
655   }
656 
657   TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
658   TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
659   return true;
660 }
661 
662 /// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
663 static MachineInstrBuilder mergeVectorRegsToResultRegs(
664   MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
665   MachineRegisterInfo &MRI = *B.getMRI();
666   LLT LLTy = MRI.getType(DstRegs[0]);
667   LLT PartLLT = MRI.getType(SrcRegs[0]);
668 
669   // Deal with v3s16 split into v2s16
670   LLT LCMTy = getLCMType(LLTy, PartLLT);
671   if (LCMTy == LLTy) {
672     // Common case where no padding is needed.
673     assert(DstRegs.size() == 1);
674     return B.buildConcatVectors(DstRegs[0], SrcRegs);
675   }
676 
  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
678   Register Undef = B.buildUndef(PartLLT).getReg(0);
679 
680   // Build vector of undefs.
681   SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
682 
683   // Replace the first sources with the real registers.
684   std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
685 
686   auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
687   int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
688 
689   SmallVector<Register, 8> PadDstRegs(NumDst);
690   std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
691 
692   // Create the excess dead defs for the unmerge.
693   for (int I = DstRegs.size(); I != NumDst; ++I)
694     PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
695 
696   return B.buildUnmerge(PadDstRegs, Widened);
697 }
698 
699 // TODO: Move this to generic code
700 static void packSplitRegsToOrigType(MachineIRBuilder &B,
701                                     ArrayRef<Register> OrigRegs,
702                                     ArrayRef<Register> Regs,
703                                     LLT LLTy,
704                                     LLT PartLLT) {
705   MachineRegisterInfo &MRI = *B.getMRI();
706 
707   if (!LLTy.isVector() && !PartLLT.isVector()) {
708     assert(OrigRegs.size() == 1);
709     LLT OrigTy = MRI.getType(OrigRegs[0]);
710 
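    // Scalar case: merge the parts back together, truncating if the parts were
    // widened past the original size.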
711     unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits()) {
      B.buildMerge(OrigRegs[0], Regs);
    } else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }
718 
719     return;
720   }
721 
722   if (LLTy.isVector() && PartLLT.isVector()) {
723     assert(OrigRegs.size() == 1);
724     assert(LLTy.getElementType() == PartLLT.getElementType());
725     mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
726     return;
727   }
728 
729   assert(LLTy.isVector() && !PartLLT.isVector());
730 
731   LLT DstEltTy = LLTy.getElementType();
732 
733   // Pointer information was discarded. We'll need to coerce some register types
734   // to avoid violating type constraints.
735   LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();
736 
737   assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());
738 
739   if (DstEltTy == PartLLT) {
740     // Vector was trivially scalarized.
741 
742     if (RealDstEltTy.isPointer()) {
743       for (Register Reg : Regs)
744         MRI.setType(Reg, RealDstEltTy);
745     }
746 
747     B.buildBuildVector(OrigRegs[0], Regs);
748   } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
749     // Deal with vector with 64-bit elements decomposed to 32-bit
750     // registers. Need to create intermediate 64-bit elements.
751     SmallVector<Register, 8> EltMerges;
752     int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
753 
754     assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
755 
    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
757       auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
758       // Fix the type in case this is really a vector of pointers.
759       MRI.setType(Merge.getReg(0), RealDstEltTy);
760       EltMerges.push_back(Merge.getReg(0));
761       Regs = Regs.drop_front(PartsPerElt);
762     }
763 
764     B.buildBuildVector(OrigRegs[0], EltMerges);
765   } else {
766     // Vector was split, and elements promoted to a wider type.
767     LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
768     auto BV = B.buildBuildVector(BVType, Regs);
769     B.buildTrunc(OrigRegs[0], BV);
770   }
771 }
772 
773 bool AMDGPUCallLowering::lowerFormalArguments(
774     MachineIRBuilder &B, const Function &F,
775     ArrayRef<ArrayRef<Register>> VRegs) const {
776   CallingConv::ID CC = F.getCallingConv();
777 
778   // The infrastructure for normal calling convention lowering is essentially
779   // useless for kernels. We want to avoid any kind of legalization or argument
780   // splitting.
781   if (CC == CallingConv::AMDGPU_KERNEL)
782     return lowerFormalArgumentsKernel(B, F, VRegs);
783 
784   const bool IsShader = AMDGPU::isShader(CC);
785   const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
786 
787   MachineFunction &MF = B.getMF();
788   MachineBasicBlock &MBB = B.getMBB();
789   MachineRegisterInfo &MRI = MF.getRegInfo();
790   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
791   const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
792   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
793   const DataLayout &DL = F.getParent()->getDataLayout();
794 
796   SmallVector<CCValAssign, 16> ArgLocs;
797   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
798 
799   if (!IsEntryFunc) {
800     Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
801     Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
802                                          &AMDGPU::SGPR_64RegClass);
803     MBB.addLiveIn(ReturnAddrReg);
804     B.buildCopy(LiveInReturn, ReturnAddrReg);
805   }
806 
807   if (Info->hasImplicitBufferPtr()) {
808     Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
809     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
810     CCInfo.AllocateReg(ImplicitBufferPtrReg);
811   }
812 
814   SmallVector<ArgInfo, 32> SplitArgs;
815   unsigned Idx = 0;
816   unsigned PSInputNum = 0;
817 
818   for (auto &Arg : F.args()) {
819     if (DL.getTypeStoreSize(Arg.getType()) == 0)
820       continue;
821 
822     const bool InReg = Arg.hasAttribute(Attribute::InReg);
823 
824     // SGPR arguments to functions not implemented.
825     if (!IsShader && InReg)
826       return false;
827 
828     if (Arg.hasAttribute(Attribute::SwiftSelf) ||
829         Arg.hasAttribute(Attribute::SwiftError) ||
830         Arg.hasAttribute(Attribute::Nest))
831       return false;
832 
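    // Pixel shader interpolation inputs that are unused and not already
    // allocated are skipped entirely; their vregs are simply defined as undef.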
833     if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
834       const bool ArgUsed = !Arg.use_empty();
835       bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
836 
837       if (!SkipArg) {
838         Info->markPSInputAllocated(PSInputNum);
839         if (ArgUsed)
840           Info->markPSInputEnabled(PSInputNum);
841       }
842 
843       ++PSInputNum;
844 
845       if (SkipArg) {
846         for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
847           B.buildUndef(VRegs[Idx][I]);
848 
849         ++Idx;
850         continue;
851       }
852     }
853 
854     ArgInfo OrigArg(VRegs[Idx], Arg.getType());
855     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
856     setArgFlags(OrigArg, OrigArgIdx, DL, F);
857 
858     splitToValueTypes(
859       B, OrigArg, SplitArgs, DL, CC, false,
860       // FIXME: We should probably be passing multiple registers to
861       // handleAssignments to do this
862       [&](ArrayRef<Register> Regs, Register DstReg,
863           LLT LLTy, LLT PartLLT, int VTSplitIdx) {
864         assert(DstReg == VRegs[Idx][VTSplitIdx]);
865         packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
866                                 LLTy, PartLLT);
867       });
868 
869     ++Idx;
870   }
871 
872   // At least one interpolation mode must be enabled or else the GPU will
873   // hang.
874   //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here; the user should take
  // responsibility for the correct programming.
880   //
881   // Otherwise, the following restrictions apply:
882   // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
883   // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
884   //   enabled too.
885   if (CC == CallingConv::AMDGPU_PS) {
886     if ((Info->getPSInputAddr() & 0x7F) == 0 ||
887         ((Info->getPSInputAddr() & 0xF) == 0 &&
888          Info->isPSInputAllocated(11))) {
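      // Force the first input on, and reserve VGPR0 and VGPR1 for it, so the
      // enable mask is nonzero.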
889       CCInfo.AllocateReg(AMDGPU::VGPR0);
890       CCInfo.AllocateReg(AMDGPU::VGPR1);
891       Info->markPSInputAllocated(0);
892       Info->markPSInputEnabled(0);
893     }
894 
895     if (Subtarget.isAmdPalOS()) {
896       // For isAmdPalOS, the user does not enable some bits after compilation
897       // based on run-time states; the register values being generated here are
898       // the final ones set in hardware. Therefore we need to apply the
899       // workaround to PSInputAddr and PSInputEnable together.  (The case where
900       // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
901       // set up an input arg for a particular interpolation mode, but nothing
902       // uses that input arg. Really we should have an earlier pass that removes
903       // such an arg.)
904       unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
905       if ((PsInputBits & 0x7F) == 0 ||
906           ((PsInputBits & 0xF) == 0 &&
907            (PsInputBits >> 11 & 1)))
908         Info->markPSInputEnabled(
909           countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
910     }
911   }
912 
913   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
914   CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
915 
916   if (!MBB.empty())
917     B.setInstr(*MBB.begin());
918 
919   if (!IsEntryFunc) {
920     // For the fixed ABI, pass workitem IDs in the last argument register.
921     if (AMDGPUTargetMachine::EnableFixedFunctionABI)
922       TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
923   }
924 
925   FormalArgHandler Handler(B, MRI, AssignFn);
926   if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
927     return false;
928 
929   if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
930     // Special inputs come after user arguments.
931     TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
932   }
933 
934   // Start adding system SGPRs.
935   if (IsEntryFunc) {
936     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
937   } else {
938     CCInfo.AllocateReg(Info->getScratchRSrcReg());
939     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
940   }
941 
942   // Move back to the end of the basic block.
943   B.setMBB(MBB);
944 
945   return true;
946 }
947 
948 bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
949                                            CCState &CCInfo,
950                                            SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
951                                            CallLoweringInfo &Info) const {
952   MachineFunction &MF = MIRBuilder.getMF();
953 
954   const AMDGPUFunctionArgInfo *CalleeArgInfo
955     = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
956 
957   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
958   const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();
959 
961   // TODO: Unify with private memory register handling. This is complicated by
962   // the fact that at least in kernels, the input argument is not necessarily
963   // in the same location as the input.
964   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
965     AMDGPUFunctionArgInfo::DISPATCH_PTR,
966     AMDGPUFunctionArgInfo::QUEUE_PTR,
967     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
968     AMDGPUFunctionArgInfo::DISPATCH_ID,
969     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
970     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
971     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
972   };
973 
974   MachineRegisterInfo &MRI = MF.getRegInfo();
975 
976   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
977   const AMDGPULegalizerInfo *LI
978     = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());
979 
980   for (auto InputID : InputRegs) {
981     const ArgDescriptor *OutgoingArg;
982     const TargetRegisterClass *ArgRC;
983     LLT ArgTy;
984 
985     std::tie(OutgoingArg, ArgRC, ArgTy) =
986         CalleeArgInfo->getPreloadedValue(InputID);
987     if (!OutgoingArg)
988       continue;
989 
990     const ArgDescriptor *IncomingArg;
991     const TargetRegisterClass *IncomingArgRC;
992     std::tie(IncomingArg, IncomingArgRC, ArgTy) =
993         CallerArgInfo.getPreloadedValue(InputID);
994     assert(IncomingArgRC == ArgRC);
995 
996     Register InputReg = MRI.createGenericVirtualRegister(ArgTy);
997 
998     if (IncomingArg) {
999       LI->loadInputValue(InputReg, MIRBuilder, IncomingArg);
1000     } else {
1001       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1002       LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
1003     }
1004 
1005     if (OutgoingArg->isRegister()) {
1006       ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
1007       if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
1008         report_fatal_error("failed to allocate implicit input argument");
1009     } else {
1010       LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
1011       return false;
1012     }
1013   }
1014 
  // Pack the workitem IDs into a single register, or pass them as-is if they
  // are already packed.
1017   const ArgDescriptor *OutgoingArg;
1018   const TargetRegisterClass *ArgRC;
1019   LLT ArgTy;
1020 
1021   std::tie(OutgoingArg, ArgRC, ArgTy) =
1022       CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1023   if (!OutgoingArg)
1024     std::tie(OutgoingArg, ArgRC, ArgTy) =
1025         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1026   if (!OutgoingArg)
1027     std::tie(OutgoingArg, ArgRC, ArgTy) =
1028         CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1029   if (!OutgoingArg)
1030     return false;
1031 
1032   const ArgDescriptor *IncomingArgX = std::get<0>(
1033       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
1034   const ArgDescriptor *IncomingArgY = std::get<0>(
1035       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
1036   const ArgDescriptor *IncomingArgZ = std::get<0>(
1037       CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
1038 
1039   const LLT S32 = LLT::scalar(32);
1040 
  // If the incoming IDs are not packed, we need to pack them.
1042   // FIXME: Should consider known workgroup size to eliminate known 0 cases.
1043   Register InputReg;
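  // The packed layout in a single 32-bit register is: X in bits [9:0], Y in
  // bits [19:10], Z in bits [29:20].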
1044   if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
1045     InputReg = MRI.createGenericVirtualRegister(S32);
1046     LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX);
1047   }
1048 
1049   if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
1050     Register Y = MRI.createGenericVirtualRegister(S32);
1051     LI->loadInputValue(Y, MIRBuilder, IncomingArgY);
1052 
1053     Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
1054     InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
1055   }
1056 
1057   if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
1058     Register Z = MRI.createGenericVirtualRegister(S32);
1059     LI->loadInputValue(Z, MIRBuilder, IncomingArgZ);
1060 
1061     Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
1062     InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
1063   }
1064 
1065   if (!InputReg) {
1066     InputReg = MRI.createGenericVirtualRegister(S32);
1067 
    // The workitem IDs are already packed, so any of the present incoming
    // arguments will carry all required fields.
1070     ArgDescriptor IncomingArg = ArgDescriptor::createArg(
1071       IncomingArgX ? *IncomingArgX :
1072         IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
1073     LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg);
1074   }
1075 
1076   if (OutgoingArg->isRegister()) {
1077     ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
1078     if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
1079       report_fatal_error("failed to allocate implicit input argument");
1080   } else {
1081     LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
1082     return false;
1083   }
1084 
1085   return true;
1086 }
1087 
1088 /// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn for
1089 /// CC.
1090 static std::pair<CCAssignFn *, CCAssignFn *>
1091 getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
1092   return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
1093 }
1094 
1095 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
1096                               bool IsTailCall) {
1097   return AMDGPU::SI_CALL;
1098 }
1099 
1100 // Add operands to call instruction to track the callee.
1101 static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
1102                                   MachineIRBuilder &MIRBuilder,
1103                                   AMDGPUCallLowering::CallLoweringInfo &Info) {
1104   if (Info.Callee.isReg()) {
1105     CallInst.addImm(0);
1106     CallInst.add(Info.Callee);
1107   } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
1108     // The call lowering lightly assumed we can directly encode a call target in
1109     // the instruction, which is not the case. Materialize the address here.
1110     const GlobalValue *GV = Info.Callee.getGlobal();
1111     auto Ptr = MIRBuilder.buildGlobalValue(
1112       LLT::pointer(GV->getAddressSpace(), 64), GV);
1113     CallInst.addReg(Ptr.getReg(0));
1114     CallInst.add(Info.Callee);
1115   } else
1116     return false;
1117 
1118   return true;
1119 }
1120 
1121 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
1122                                    CallLoweringInfo &Info) const {
1123   if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
1124     LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
1125     return false;
1126   }
1127 
1128   if (Info.IsVarArg) {
1129     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
1130     return false;
1131   }
1132 
1133   MachineFunction &MF = MIRBuilder.getMF();
1134   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1135   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1136 
1137   const Function &F = MF.getFunction();
1138   MachineRegisterInfo &MRI = MF.getRegInfo();
1139   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
1140   const DataLayout &DL = F.getParent()->getDataLayout();
1141 
1142   if (AMDGPU::isShader(F.getCallingConv())) {
1143     LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
1144     return false;
1145   }
1146 
1147   SmallVector<ArgInfo, 8> OutArgs;
1148   SmallVector<ArgInfo, 4> SplitRetInfos;
1149 
1150   for (auto &OrigArg : Info.OrigArgs) {
1151     splitToValueTypes(
1152       MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
1153       // FIXME: We should probably be passing multiple registers to
1154       // handleAssignments to do this
1155       [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
1156           int VTSplitIdx) {
1157         unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
1158       });
1159   }
1160 
1161   SmallVector<ArgInfo, 8> InArgs;
1162   if (!Info.OrigRet.Ty->isVoidTy()) {
1163     LLVM_DEBUG(dbgs() << "Call return values not yet handled\n");
1164     return false;
1165   }
1166 
1167   // If we can lower as a tail call, do that instead.
1168   bool CanTailCallOpt = false;
1169 
1170   // We must emit a tail call if we have musttail.
1171   if (Info.IsMustTailCall && !CanTailCallOpt) {
1172     LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
1173     return false;
1174   }
1175 
1176   // Find out which ABI gets to decide where things go.
1177   CCAssignFn *AssignFnFixed;
1178   CCAssignFn *AssignFnVarArg;
1179   std::tie(AssignFnFixed, AssignFnVarArg) =
1180       getAssignFnsForCC(Info.CallConv, TLI);
1181 
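  // Begin the call sequence; the matching ADJCALLSTACKDOWN emitted after the
  // call closes it once the arguments have been marshalled.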
1182   MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
1183     .addImm(0)
1184     .addImm(0);
1185 
1186   // Create a temporarily-floating call instruction so we can add the implicit
1187   // uses of arg registers.
1188   unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
1189 
1190   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
1191   MIB.addDef(TRI->getReturnAddressReg(MF));
1192 
1193   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
1194     return false;
1195 
1196   // Tell the call which registers are clobbered.
1197   const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
1198   MIB.addRegMask(Mask);
1199 
1200   SmallVector<CCValAssign, 16> ArgLocs;
1201   CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());
1202 
1203   // We could pass MIB and directly add the implicit uses to the call
1204   // now. However, as an aesthetic choice, place implicit argument operands
1205   // after the ordinary user argument registers.
1206   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
1207 
1208   if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
1209     // With a fixed ABI, allocate fixed registers before user arguments.
1210     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
1211       return false;
1212   }
1213 
1214   // Do the actual argument marshalling.
1215   SmallVector<Register, 8> PhysRegs;
1216   OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
1217                              AssignFnVarArg, false);
1218   if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
1219     return false;
1220 
1221   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1222 
  // Insert copies for the scratch resource descriptor (SRD). In the HSA case,
  // this should be an identity copy.
1225   auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
1226                                              MFI->getScratchRSrcReg());
1227   MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
1228   MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
1229 
1230   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
1231     MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
1232     MIB.addReg(ArgReg.first, RegState::Implicit);
1233   }
1234 
1235   // Get a count of how many bytes are to be pushed on the stack.
1236   unsigned NumBytes = CCInfo.getNextStackOffset();
1237 
1238   // Now we can add the actual call instruction to the correct position.
1239   MIRBuilder.insertInstr(MIB);
1240 
  // If the callee is a register, it is used by a target-specific instruction
  // and must therefore have a register class matching the constraint of that
  // instruction.
1244 
1245   // FIXME: We should define regbankselectable call instructions to handle
1246   // divergent call targets.
1247   if (MIB->getOperand(1).isReg()) {
1248     MIB->getOperand(1).setReg(constrainOperandRegClass(
1249         MF, *TRI, MRI, *ST.getInstrInfo(),
1250         *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
1251         1));
1252   }
1253 
1254   // Finally we can copy the returned value back into its virtual-register. In
1255   // symmetry with the arguments, the physical register must be an
1256   // implicit-define of the call instruction.
1257   if (!Info.OrigRet.Ty->isVoidTy()) {
1258     CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
1259                                                       Info.IsVarArg);
1260     CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
1261     if (!handleAssignments(MIRBuilder, InArgs, Handler))
1262       return false;
1263   }
1264 
1265   uint64_t CalleePopBytes = NumBytes;
1266   MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
1267     .addImm(0)
1268     .addImm(CalleePopBytes);
1269 
1270   return true;
1271 }
1272