//===-- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp - Call lowering -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCallLowering.h"
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/LowLevelTypeImpl.h"

#define DEBUG_TYPE "amdgpu-call-lowering"

using namespace llvm;

namespace {

struct AMDGPUValueHandler : public CallLowering::ValueHandler {
  AMDGPUValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : ValueHandler(B, MRI, AssignFn) {}

  /// Wrapper around extendRegister to ensure we extend to a full 32-bit
  /// register.
  Register extendRegisterMin32(Register ValVReg, CCValAssign &VA) {
    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to
      // extend and do a 32-bit copy to avoid the verifier complaining about it.
      return MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
    }

    return extendRegister(ValVReg, VA);
  }
};

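// Handler for outgoing return values of the current function: each split
// value is widened to at least 32 bits when necessary, copied into the
// physical register chosen by the calling convention, and attached to the
// return instruction as an implicit use. A readfirstlane is inserted when the
// destination is an SGPR. Roughly, returning an s16 shader value in VGPR0
// becomes:
//   %ext:_(s32) = G_ANYEXT %val(s16)
//   $vgpr0 = COPY %ext(s32)
//   SI_RETURN_TO_EPILOG implicit $vgpr0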
struct OutgoingValueHandler : public AMDGPUValueHandler {
  OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                       MachineInstrBuilder MIB, CCAssignFn *AssignFn)
      : AMDGPUValueHandler(B, MRI, AssignFn), MIB(MIB) {}

  MachineInstrBuilder MIB;

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    llvm_unreachable("not implemented");
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    Register ExtReg = extendRegisterMin32(ValVReg, VA);

    // If this is a scalar return, insert a readfirstlane just in case the value
    // ends up in a VGPR.
    // FIXME: Assert this is a shader return.
    const SIRegisterInfo *TRI
      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
    if (TRI->isSGPRReg(MRI, PhysReg)) {
      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
                                              {MRI.getType(ExtReg)}, false)
        .addReg(ExtReg);
      ExtReg = ToSGPR.getReg(0);
    }

    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }

  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo,
                 const CallLowering::ArgInfo &Info,
                 ISD::ArgFlagsTy Flags,
                 CCState &State) override {
    return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
  }
};

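// Common handler for values arriving in the current function: formal
// arguments and values returned by callees. Register-assigned values become
// copies from physical registers (truncated back down if they were promoted
// to 32 bits), and stack-assigned values become loads from fixed frame
// objects in the private address space.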
struct IncomingArgHandler : public AMDGPUValueHandler {
  uint64_t StackUsed = 0;

  IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                     CCAssignFn *AssignFn)
    : AMDGPUValueHandler(B, MRI, AssignFn) {}

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    auto &MFI = MIRBuilder.getMF().getFrameInfo();
    int FI = MFI.CreateFixedObject(Size, Offset, true);
    MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
    auto AddrReg = MIRBuilder.buildFrameIndex(
        LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
    StackUsed = std::max(StackUsed, Size + Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    markPhysRegUsed(PhysReg);

    if (VA.getLocVT().getSizeInBits() < 32) {
      // 16-bit types are reported as legal for 32-bit registers. We need to do
      // a 32-bit copy, and truncate to avoid the verifier complaining about it.
      auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      return;
    }

    switch (VA.getLocInfo()) {
    case CCValAssign::LocInfo::SExt:
    case CCValAssign::LocInfo::ZExt:
    case CCValAssign::LocInfo::AExt: {
      auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
      MIRBuilder.buildTrunc(ValVReg, Copy);
      break;
    }
    default:
      MIRBuilder.buildCopy(ValVReg, PhysReg);
      break;
    }
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();

    // FIXME: Get alignment
    auto MMO = MF.getMachineMemOperand(
        MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
        inferAlignFromPtrInfo(MF, MPO));
    MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
  }

  /// How the physical register gets marked varies between formal
  /// parameters (it's a basic-block live-in) and call return values (it's an
  /// implicit-def of the call instruction).
  virtual void markPhysRegUsed(unsigned PhysReg) = 0;

  // FIXME: What is the point of this being a callback?
  bool isIncomingArgumentHandler() const override { return true; }
};

struct FormalArgHandler : public IncomingArgHandler {
  FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
                   CCAssignFn *AssignFn)
    : IncomingArgHandler(B, MRI, AssignFn) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIRBuilder.getMBB().addLiveIn(PhysReg);
  }
};

struct CallReturnHandler : public IncomingArgHandler {
  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
    : IncomingArgHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}

  void markPhysRegUsed(unsigned PhysReg) override {
    MIB.addDef(PhysReg, RegState::Implicit);
  }

  MachineInstrBuilder MIB;
};

struct OutgoingArgHandler : public AMDGPUValueHandler {
  MachineInstrBuilder MIB;
  CCAssignFn *AssignFnVarArg;

  /// For tail calls, the byte offset of the call's argument area from the
  /// callee's. Unused elsewhere.
  int FPDiff;

  // Cache the SP register vreg if we need it more than once in this call site.
  Register SPReg;

  bool IsTailCall;

  OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                     MachineInstrBuilder MIB, CCAssignFn *AssignFn,
                     CCAssignFn *AssignFnVarArg, bool IsTailCall = false,
                     int FPDiff = 0)
      : AMDGPUValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
        AssignFnVarArg(AssignFnVarArg),
        FPDiff(FPDiff), IsTailCall(IsTailCall) {}

  bool isIncomingArgumentHandler() const override { return false; }

  Register getStackAddress(uint64_t Size, int64_t Offset,
                           MachinePointerInfo &MPO) override {
    MachineFunction &MF = MIRBuilder.getMF();
    const LLT PtrTy = LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32);
    const LLT S32 = LLT::scalar(32);

    if (IsTailCall) {
      llvm_unreachable("implement me");
    }

    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

    if (!SPReg)
      SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);

    auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);

    auto AddrReg = MIRBuilder.buildPtrAdd(PtrTy, SPReg, OffsetReg);
    MPO = MachinePointerInfo::getStack(MF, Offset);
    return AddrReg.getReg(0);
  }

  void assignValueToReg(Register ValVReg, Register PhysReg,
                        CCValAssign &VA) override {
    MIB.addUse(PhysReg, RegState::Implicit);
    Register ExtReg = extendRegisterMin32(ValVReg, VA);
    MIRBuilder.buildCopy(PhysReg, ExtReg);
  }

  void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
                            MachinePointerInfo &MPO, CCValAssign &VA) override {
    MachineFunction &MF = MIRBuilder.getMF();
    uint64_t LocMemOffset = VA.getLocMemOffset();
    const auto &ST = MF.getSubtarget<GCNSubtarget>();

    auto MMO = MF.getMachineMemOperand(
      MPO, MachineMemOperand::MOStore, Size,
      commonAlignment(ST.getStackAlignment(), LocMemOffset));
    MIRBuilder.buildStore(ValVReg, Addr, *MMO);
  }

  void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
                            uint64_t Size, MachinePointerInfo &MPO,
                            CCValAssign &VA) override {
    Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
                           ? extendRegister(Arg.Regs[0], VA)
                           : Arg.Regs[0];

    // If we extended we might need to adjust the MMO's Size.
    const LLT RegTy = MRI.getType(ValVReg);
    if (RegTy.getSizeInBytes() > Size)
      Size = RegTy.getSizeInBytes();

    assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
  }
};
} // end anonymous namespace

AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
  : CallLowering(&TLI) {
}

// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
  switch (MIOpc) {
  case TargetOpcode::G_SEXT:
    return ISD::SIGN_EXTEND;
  case TargetOpcode::G_ZEXT:
    return ISD::ZERO_EXTEND;
  case TargetOpcode::G_ANYEXT:
    return ISD::ANY_EXTEND;
  default:
    llvm_unreachable("not an extend opcode");
  }
}

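// Break an IR argument or return value down into the register-sized pieces
// the calling convention expects, calling \p PerformArgSplit to connect the
// part registers back to the original virtual register. When no splitting is
// needed the value is just retyped (e.g. [1 x double] -> double); otherwise,
// e.g., an s64 value whose register type for the convention is s32 would
// typically be reported as two s32 part registers.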
void AMDGPUCallLowering::splitToValueTypes(
  MachineIRBuilder &B,
  const ArgInfo &OrigArg,
  SmallVectorImpl<ArgInfo> &SplitArgs,
  const DataLayout &DL, CallingConv::ID CallConv,
  bool IsOutgoing,
  SplitArgTy PerformArgSplit) const {
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  LLVMContext &Ctx = OrigArg.Ty->getContext();

  if (OrigArg.Ty->isVoidTy())
    return;

  SmallVector<EVT, 4> SplitVTs;
  ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);

  assert(OrigArg.Regs.size() == SplitVTs.size());

  int SplitIdx = 0;
  for (EVT VT : SplitVTs) {
    Register Reg = OrigArg.Regs[SplitIdx];
    Type *Ty = VT.getTypeForEVT(Ctx);
    LLT LLTy = getLLTForType(*Ty, DL);

    if (IsOutgoing && VT.isScalarInteger()) {
      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
      if (OrigArg.Flags[0].isSExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_SEXT;
      } else if (OrigArg.Flags[0].isZExt()) {
        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
        ExtendOp = TargetOpcode::G_ZEXT;
      }

      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
                                          extOpcodeToISDExtOpcode(ExtendOp));
      if (ExtVT.getSizeInBits() != VT.getSizeInBits()) {
        VT = ExtVT;
        Ty = ExtVT.getTypeForEVT(Ctx);
        LLTy = getLLTForType(*Ty, DL);
        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
      }
    }

    unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
    MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);

    if (NumParts == 1) {
      // No splitting to do, but we want to replace the original type (e.g. [1 x
      // double] -> double).
      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);

      ++SplitIdx;
      continue;
    }

    SmallVector<Register, 8> SplitRegs;
    Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
    LLT PartLLT = getLLTForType(*PartTy, DL);
    MachineRegisterInfo &MRI = *B.getMRI();

    // FIXME: Should we be reporting all of the part registers for a single
    // argument, and let handleAssignments take care of the repacking?
    for (unsigned i = 0; i < NumParts; ++i) {
      Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
      SplitRegs.push_back(PartReg);
      SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
    }

    PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);

    ++SplitIdx;
  }
}

// Get the appropriate type to make \p OrigTy \p Factor times bigger.
static LLT getMultipleType(LLT OrigTy, int Factor) {
  if (OrigTy.isVector()) {
    return LLT::vector(OrigTy.getNumElements() * Factor,
                       OrigTy.getElementType());
  }

  return LLT::scalar(OrigTy.getSizeInBits() * Factor);
}

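// Split a value covered by a single virtual register into the part registers
// the calling convention assigned to it. Three shapes are handled:
//   * A scalarized-and-extended vector, e.g. <2 x s16> passed as two s32
//     parts: unmerge to the s16 elements, then anyext each one.
//   * A source that divides evenly into the parts, e.g. s64 into two s32
//     parts: a single G_UNMERGE_VALUES.
//   * A leftover-sized source, e.g. s96 into s64 parts: insert the source
//     into an undef s128 and extract each part from it.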
// TODO: Move to generic code
static void unpackRegsToOrigType(MachineIRBuilder &B,
                                 ArrayRef<Register> DstRegs,
                                 Register SrcReg,
                                 const CallLowering::ArgInfo &Info,
                                 LLT SrcTy,
                                 LLT PartTy) {
  assert(DstRegs.size() > 1 && "Nothing to unpack");

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned PartSize = PartTy.getSizeInBits();

  if (SrcTy.isVector() && !PartTy.isVector() &&
      PartSize > SrcTy.getElementType().getSizeInBits()) {
    // Vector was scalarized, and the elements extended.
    auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(), SrcReg);
    for (int i = 0, e = DstRegs.size(); i != e; ++i)
      B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
    return;
  }

  if (SrcSize % PartSize == 0) {
    B.buildUnmerge(DstRegs, SrcReg);
    return;
  }

  const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;

  LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
  auto ImpDef = B.buildUndef(BigTy);

  auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);

  int64_t Offset = 0;
  for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
    B.buildExtract(DstRegs[i], Big, Offset);
}

/// Lower the return value for the already existing \p Ret. This assumes that
/// \p B's insertion point is correct.
bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
                                        const Value *Val, ArrayRef<Register> VRegs,
                                        MachineInstrBuilder &Ret) const {
  if (!Val)
    return true;

  auto &MF = B.getMF();
  const auto &F = MF.getFunction();
  const DataLayout &DL = MF.getDataLayout();
  MachineRegisterInfo *MRI = B.getMRI();

  CallingConv::ID CC = F.getCallingConv();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  ArgInfo OrigRetInfo(VRegs, Val->getType());
  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
  SmallVector<ArgInfo, 4> SplitRetInfos;

  splitToValueTypes(
    B, OrigRetInfo, SplitRetInfos, DL, CC, true,
    [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
        int VTSplitIdx) {
      unpackRegsToOrigType(B, Regs, SrcReg,
                           SplitRetInfos[VTSplitIdx],
                           LLTy, PartLLT);
    });

  CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
  OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
  return handleAssignments(B, SplitRetInfos, RetHandler);
}

bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
                                     const Value *Val,
                                     ArrayRef<Register> VRegs) const {
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MFI->setIfReturnsVoid(!Val);

  assert(!Val == VRegs.empty() && "Return value without a vreg");

  CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
                         AMDGPU::isKernel(CC);
  if (IsWaveEnd) {
    B.buildInstr(AMDGPU::S_ENDPGM)
      .addImm(0);
    return true;
  }

  auto const &ST = MF.getSubtarget<GCNSubtarget>();

  unsigned ReturnOpc =
      IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;

  auto Ret = B.buildInstrNoInsert(ReturnOpc);
  Register ReturnAddrVReg;
  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
    Ret.addUse(ReturnAddrVReg);
  }

  if (!lowerReturnVal(B, Val, VRegs, Ret))
    return false;

  if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
                                         &AMDGPU::SGPR_64RegClass);
    B.buildCopy(ReturnAddrVReg, LiveInReturn);
  }

  // TODO: Handle CalleeSavedRegsViaCopy.

  B.insertInstr(Ret);
  return true;
}

Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
                                               Type *ParamTy,
                                               uint64_t Offset) const {
  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
  LLT PtrType = getLLTForType(*PtrTy, DL);
  Register KernArgSegmentPtr =
    MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);

  auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);

  return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}

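// Kernel arguments are not passed in registers; they live in the constant
// address space at a fixed offset from the kernarg segment pointer. Loading
// one is a G_PTR_ADD off that pointer (see lowerParameterPtr above) followed
// by an invariant, dereferenceable load. In rough pseudo-MIR, for a
// hypothetical 4-byte argument at byte offset 36:
//   %off:_(s64) = G_CONSTANT i64 36
//   %ptr:_(p4) = G_PTR_ADD %kernarg_segment_ptr, %off
//   %val:_(s32) = G_LOAD %ptr :: (dereferenceable invariant load 4, ...)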
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
                                        uint64_t Offset, Align Alignment,
                                        Register DstReg) const {
  MachineFunction &MF = B.getMF();
  const Function &F = MF.getFunction();
  const DataLayout &DL = F.getParent()->getDataLayout();
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
  Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      TypeSize, Alignment);

  B.buildLoad(DstReg, PtrReg, *MMO);
}

// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
                                 MachineIRBuilder &B,
                                 MachineFunction &MF,
                                 const SIRegisterInfo &TRI,
                                 SIMachineFunctionInfo &Info) {
  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
  if (Info.hasPrivateSegmentBuffer()) {
    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
    CCInfo.AllocateReg(PrivateSegmentBufferReg);
  }

  if (Info.hasDispatchPtr()) {
    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchPtrReg);
  }

  if (Info.hasQueuePtr()) {
    Register QueuePtrReg = Info.addQueuePtr(TRI);
    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(QueuePtrReg);
  }

  if (Info.hasKernargSegmentPtr()) {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
    const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    Register VReg = MRI.createGenericVirtualRegister(P4);
    MRI.addLiveIn(InputPtrReg, VReg);
    B.getMBB().addLiveIn(InputPtrReg);
    B.buildCopy(VReg, InputPtrReg);
    CCInfo.AllocateReg(InputPtrReg);
  }

  if (Info.hasDispatchID()) {
    Register DispatchIDReg = Info.addDispatchID(TRI);
    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(DispatchIDReg);
  }

  if (Info.hasFlatScratchInit()) {
    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(FlatScratchInitReg);
  }

  // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
  // these from the dispatch pointer.
}

bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();

  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());

  allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);

  unsigned i = 0;
  const Align KernArgBaseAlign(16);
  const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
  uint64_t ExplicitArgOffset = 0;

  // TODO: Align down to dword alignment and extract bits for extending loads.
  for (auto &Arg : F.args()) {
    Type *ArgTy = Arg.getType();
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
    if (AllocSize == 0)
      continue;

    Align ABIAlign = DL.getABITypeAlign(ArgTy);

    uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;

    if (Arg.use_empty()) {
      ++i;
      continue;
    }

    ArrayRef<Register> OrigArgRegs = VRegs[i];
    Register ArgReg =
      OrigArgRegs.size() == 1
      ? OrigArgRegs[0]
      : MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));

    Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
    lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
    if (OrigArgRegs.size() > 1)
      unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
    ++i;
  }

  TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
  TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
  return true;
}

/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
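///
/// The source parts may collectively be wider than the destination, e.g. a
/// <3 x s16> result arriving here as two <2 x s16> parts. In that case the
/// parts are padded with an undef, concatenated into the LCM type (<6 x s16>
/// for this example), and unmerged into the real destination plus dead
/// leftover defs.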
static MachineInstrBuilder mergeVectorRegsToResultRegs(
  MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT LLTy = MRI.getType(DstRegs[0]);
  LLT PartLLT = MRI.getType(SrcRegs[0]);

  // Deal with v3s16 split into v2s16
  LLT LCMTy = getLCMType(LLTy, PartLLT);
  if (LCMTy == LLTy) {
    // Common case where no padding is needed.
    assert(DstRegs.size() == 1);
    return B.buildConcatVectors(DstRegs[0], SrcRegs);
  }

  const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
  Register Undef = B.buildUndef(PartLLT).getReg(0);

  // Build vector of undefs.
  SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);

  // Replace the first sources with the real registers.
  std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());

  auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
  int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();

  SmallVector<Register, 8> PadDstRegs(NumDst);
  std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());

  // Create the excess dead defs for the unmerge.
  for (int I = DstRegs.size(); I != NumDst; ++I)
    PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);

  return B.buildUnmerge(PadDstRegs, Widened);
}

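// Reassemble the original value from the part registers produced by argument
// lowering: scalars are rebuilt with G_MERGE_VALUES (plus a trunc if the
// parts are wider than the result), split vectors are re-concatenated, wide
// vector elements are merged back together element by element, and promoted
// vector elements are gathered into a wider build_vector and truncated. For
// example, a <4 x s8> value arriving as four s32 parts becomes roughly:
//   %wide:_(<4 x s32>) = G_BUILD_VECTOR %p0, %p1, %p2, %p3
//   %val:_(<4 x s8>) = G_TRUNC %wide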
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
                                    ArrayRef<Register> OrigRegs,
                                    ArrayRef<Register> Regs,
                                    LLT LLTy,
                                    LLT PartLLT) {
  MachineRegisterInfo &MRI = *B.getMRI();

  if (!LLTy.isVector() && !PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    LLT OrigTy = MRI.getType(OrigRegs[0]);

    unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
    if (SrcSize == OrigTy.getSizeInBits())
      B.buildMerge(OrigRegs[0], Regs);
    else {
      auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
      B.buildTrunc(OrigRegs[0], Widened);
    }

    return;
  }

  if (LLTy.isVector() && PartLLT.isVector()) {
    assert(OrigRegs.size() == 1);
    assert(LLTy.getElementType() == PartLLT.getElementType());
    mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
    return;
  }

  assert(LLTy.isVector() && !PartLLT.isVector());

  LLT DstEltTy = LLTy.getElementType();

  // Pointer information was discarded. We'll need to coerce some register types
  // to avoid violating type constraints.
  LLT RealDstEltTy = MRI.getType(OrigRegs[0]).getElementType();

  assert(DstEltTy.getSizeInBits() == RealDstEltTy.getSizeInBits());

  if (DstEltTy == PartLLT) {
    // Vector was trivially scalarized.

    if (RealDstEltTy.isPointer()) {
      for (Register Reg : Regs)
        MRI.setType(Reg, RealDstEltTy);
    }

    B.buildBuildVector(OrigRegs[0], Regs);
  } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
    // Deal with vector with 64-bit elements decomposed to 32-bit
    // registers. Need to create intermediate 64-bit elements.
    SmallVector<Register, 8> EltMerges;
    int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();

    assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);

    for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
      auto Merge = B.buildMerge(RealDstEltTy, Regs.take_front(PartsPerElt));
      // Fix the type in case this is really a vector of pointers.
      MRI.setType(Merge.getReg(0), RealDstEltTy);
      EltMerges.push_back(Merge.getReg(0));
      Regs = Regs.drop_front(PartsPerElt);
    }

    B.buildBuildVector(OrigRegs[0], EltMerges);
  } else {
    // Vector was split, and elements promoted to a wider type.
    LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
    auto BV = B.buildBuildVector(BVType, Regs);
    B.buildTrunc(OrigRegs[0], BV);
  }
}

bool AMDGPUCallLowering::lowerFormalArguments(
    MachineIRBuilder &B, const Function &F,
    ArrayRef<ArrayRef<Register>> VRegs) const {
  CallingConv::ID CC = F.getCallingConv();

  // The infrastructure for normal calling convention lowering is essentially
  // useless for kernels. We want to avoid any kind of legalization or argument
  // splitting.
  if (CC == CallingConv::AMDGPU_KERNEL)
    return lowerFormalArgumentsKernel(B, F, VRegs);

  const bool IsShader = AMDGPU::isShader(CC);
  const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);

  MachineFunction &MF = B.getMF();
  MachineBasicBlock &MBB = B.getMBB();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
  const DataLayout &DL = F.getParent()->getDataLayout();

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());

  if (!IsEntryFunc) {
    Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
    Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
                                         &AMDGPU::SGPR_64RegClass);
    MBB.addLiveIn(ReturnAddrReg);
    B.buildCopy(LiveInReturn, ReturnAddrReg);
  }

  if (Info->hasImplicitBufferPtr()) {
    Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
    CCInfo.AllocateReg(ImplicitBufferPtrReg);
  }

  SmallVector<ArgInfo, 32> SplitArgs;
  unsigned Idx = 0;
  unsigned PSInputNum = 0;

  for (auto &Arg : F.args()) {
    if (DL.getTypeStoreSize(Arg.getType()) == 0)
      continue;

    const bool InReg = Arg.hasAttribute(Attribute::InReg);

    // SGPR arguments to functions not implemented.
    if (!IsShader && InReg)
      return false;

    if (Arg.hasAttribute(Attribute::SwiftSelf) ||
        Arg.hasAttribute(Attribute::SwiftError) ||
        Arg.hasAttribute(Attribute::Nest))
      return false;

    if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
      const bool ArgUsed = !Arg.use_empty();
      bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);

      if (!SkipArg) {
        Info->markPSInputAllocated(PSInputNum);
        if (ArgUsed)
          Info->markPSInputEnabled(PSInputNum);
      }

      ++PSInputNum;

      if (SkipArg) {
        for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
          B.buildUndef(VRegs[Idx][I]);

        ++Idx;
        continue;
      }
    }

    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
    const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
    setArgFlags(OrigArg, OrigArgIdx, DL, F);

    splitToValueTypes(
      B, OrigArg, SplitArgs, DL, CC, false,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register DstReg,
          LLT LLTy, LLT PartLLT, int VTSplitIdx) {
        assert(DstReg == VRegs[Idx][VTSplitIdx]);
        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
                                LLTy, PartLLT);
      });

    ++Idx;
  }

  // At least one interpolation mode must be enabled or else the GPU will
  // hang.
  //
  // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
  // set PSInputAddr, the user wants to enable some bits after the compilation
  // based on run-time states. Since we can't know what the final PSInputEna
  // will look like, we shouldn't do anything here and the user should take
  // responsibility for the correct programming.
  //
  // Otherwise, the following restrictions apply:
  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
  //   enabled too.
  if (CC == CallingConv::AMDGPU_PS) {
    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 &&
         Info->isPSInputAllocated(11))) {
      CCInfo.AllocateReg(AMDGPU::VGPR0);
      CCInfo.AllocateReg(AMDGPU::VGPR1);
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);
    }

    if (Subtarget.isAmdPalOS()) {
      // For isAmdPalOS, the user does not enable some bits after compilation
      // based on run-time states; the register values being generated here are
      // the final ones set in hardware. Therefore we need to apply the
      // workaround to PSInputAddr and PSInputEnable together.  (The case where
      // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
      // set up an input arg for a particular interpolation mode, but nothing
      // uses that input arg. Really we should have an earlier pass that removes
      // such an arg.)
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 &&
           (PsInputBits >> 11 & 1)))
        Info->markPSInputEnabled(
          countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
    }
  }

  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());

  if (!MBB.empty())
    B.setInstr(*MBB.begin());

  if (!IsEntryFunc) {
    // For the fixed ABI, pass workitem IDs in the last argument register.
    if (AMDGPUTargetMachine::EnableFixedFunctionABI)
      TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
  }

  FormalArgHandler Handler(B, MRI, AssignFn);
  if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
    return false;

  if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // Special inputs come after user arguments.
    TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Start adding system SGPRs.
  if (IsEntryFunc) {
    TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
  } else {
    CCInfo.AllocateReg(Info->getScratchRSrcReg());
    TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
  }

  // Move back to the end of the basic block.
  B.setMBB(MBB);

  return true;
}

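// Callees following the fixed ABI expect the implicit inputs they use
// (dispatch pointer, queue pointer, dispatch id, workgroup and workitem IDs,
// implicit argument pointer) to be live in specific registers at the call
// site. Forward the caller's copy of each such input, materializing it when
// the caller does not receive it directly (currently only the implicit
// argument pointer).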
bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
                                           CCState &CCInfo,
                                           SmallVectorImpl<std::pair<MCRegister, Register>> &ArgRegs,
                                           CallLoweringInfo &Info) const {
  MachineFunction &MF = MIRBuilder.getMF();

  const AMDGPUFunctionArgInfo *CalleeArgInfo
    = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUFunctionArgInfo &CallerArgInfo = MFI->getArgInfo();

  // TODO: Unify with private memory register handling. This is complicated by
  // the fact that at least in kernels, the input argument is not necessarily
  // in the same location as the input.
  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
    AMDGPUFunctionArgInfo::DISPATCH_PTR,
    AMDGPUFunctionArgInfo::QUEUE_PTR,
    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
    AMDGPUFunctionArgInfo::DISPATCH_ID,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
  };

  MachineRegisterInfo &MRI = MF.getRegInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo*>(ST.getLegalizerInfo());

  for (auto InputID : InputRegs) {
    const ArgDescriptor *OutgoingArg;
    const TargetRegisterClass *ArgRC;
    LLT ArgTy;

    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(InputID);
    if (!OutgoingArg)
      continue;

    const ArgDescriptor *IncomingArg;
    const TargetRegisterClass *IncomingArgRC;
    std::tie(IncomingArg, IncomingArgRC, ArgTy) =
        CallerArgInfo.getPreloadedValue(InputID);
    assert(IncomingArgRC == ArgRC);

    Register InputReg = MRI.createGenericVirtualRegister(ArgTy);

    if (IncomingArg) {
      LI->loadInputValue(InputReg, MIRBuilder, IncomingArg);
    } else {
      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
      LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
    }

    if (OutgoingArg->isRegister()) {
      ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
      if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
        report_fatal_error("failed to allocate implicit input argument");
    } else {
      LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
      return false;
    }
  }

  // Pack workitem IDs into a single register, or pass them as-is if already
  // packed.
  const ArgDescriptor *OutgoingArg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;

  std::tie(OutgoingArg, ArgRC, ArgTy) =
      CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  if (!OutgoingArg)
    std::tie(OutgoingArg, ArgRC, ArgTy) =
        CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  if (!OutgoingArg)
    return false;

  const ArgDescriptor *IncomingArgX = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
  const ArgDescriptor *IncomingArgY = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
  const ArgDescriptor *IncomingArgZ = std::get<0>(
      CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));

  const LLT S32 = LLT::scalar(32);

  // If incoming IDs are not packed, we need to pack them.
  // FIXME: Should consider known workgroup size to eliminate known 0 cases.
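  // The packed form puts the X id in bits [9:0], the Y id in bits [19:10], and
  // the Z id in bits [29:20] of a single 32-bit register, hence the shifts by
  // 10 and 20 below.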
  Register InputReg;
  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) {
    InputReg = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX);
  }

  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
    Register Y = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Y, MIRBuilder, IncomingArgY);

    Y = MIRBuilder.buildShl(S32, Y, MIRBuilder.buildConstant(S32, 10)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Y).getReg(0) : Y;
  }

  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
    Register Z = MRI.createGenericVirtualRegister(S32);
    LI->loadInputValue(Z, MIRBuilder, IncomingArgZ);

    Z = MIRBuilder.buildShl(S32, Z, MIRBuilder.buildConstant(S32, 20)).getReg(0);
    InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
  }

  if (!InputReg) {
    InputReg = MRI.createGenericVirtualRegister(S32);

    // Workitem IDs are already packed; any of the present incoming arguments
    // will carry all required fields.
    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
      IncomingArgX ? *IncomingArgX :
        IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
    LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg);
  }

  if (OutgoingArg->isRegister()) {
    ArgRegs.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
      report_fatal_error("failed to allocate implicit input argument");
  } else {
    LLVM_DEBUG(dbgs() << "Unhandled stack passed implicit input argument\n");
    return false;
  }

  return true;
}

/// Returns a pair containing the fixed CCAssignFn and the vararg CCAssignFn
/// for \p CC.
static std::pair<CCAssignFn *, CCAssignFn *>
getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
  return {TLI.CCAssignFnForCall(CC, false), TLI.CCAssignFnForCall(CC, true)};
}

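// Indirect calls and tail calls are not distinguished yet; every call is
// currently lowered to SI_CALL.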
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
                              bool IsTailCall) {
  return AMDGPU::SI_CALL;
}

// Add operands to call instruction to track the callee.
static bool addCallTargetOperands(MachineInstrBuilder &CallInst,
                                  MachineIRBuilder &MIRBuilder,
                                  AMDGPUCallLowering::CallLoweringInfo &Info) {
  if (Info.Callee.isReg()) {
    CallInst.addImm(0);
    CallInst.add(Info.Callee);
  } else if (Info.Callee.isGlobal() && Info.Callee.getOffset() == 0) {
    // The call lowering lightly assumed we can directly encode a call target in
    // the instruction, which is not the case. Materialize the address here.
    const GlobalValue *GV = Info.Callee.getGlobal();
    auto Ptr = MIRBuilder.buildGlobalValue(
      LLT::pointer(GV->getAddressSpace(), 64), GV);
    CallInst.addReg(Ptr.getReg(0));
    CallInst.add(Info.Callee);
  } else
    return false;

  return true;
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                   CallLoweringInfo &Info) const {
  if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
    LLVM_DEBUG(dbgs() << "Variable function ABI not implemented\n");
    return false;
  }

  if (Info.IsVarArg) {
    LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
    return false;
  }

  MachineFunction &MF = MIRBuilder.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const Function &F = MF.getFunction();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
  const DataLayout &DL = F.getParent()->getDataLayout();

  if (AMDGPU::isShader(F.getCallingConv())) {
    LLVM_DEBUG(dbgs() << "Unhandled call from graphics shader\n");
    return false;
  }

  SmallVector<ArgInfo, 8> OutArgs;
  SmallVector<ArgInfo, 4> SplitRetInfos;

  for (auto &OrigArg : Info.OrigArgs) {
    splitToValueTypes(
      MIRBuilder, OrigArg, OutArgs, DL, Info.CallConv, true,
      // FIXME: We should probably be passing multiple registers to
      // handleAssignments to do this
      [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
          int VTSplitIdx) {
        unpackRegsToOrigType(MIRBuilder, Regs, SrcReg, OrigArg, LLTy, PartLLT);
      });
  }

  SmallVector<ArgInfo, 8> InArgs;
  if (!Info.OrigRet.Ty->isVoidTy()) {
    LLVM_DEBUG(dbgs() << "Call return values not yet handled\n");
    return false;
  }

  // If we can lower as a tail call, do that instead.
  bool CanTailCallOpt = false;

  // We must emit a tail call if we have musttail.
  if (Info.IsMustTailCall && !CanTailCallOpt) {
    LLVM_DEBUG(dbgs() << "Failed to lower musttail call as tail call\n");
    return false;
  }

  // Find out which ABI gets to decide where things go.
  CCAssignFn *AssignFnFixed;
  CCAssignFn *AssignFnVarArg;
  std::tie(AssignFnFixed, AssignFnVarArg) =
      getAssignFnsForCC(Info.CallConv, TLI);

  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP)
    .addImm(0)
    .addImm(0);

  // Create a temporarily-floating call instruction so we can add the implicit
  // uses of arg registers.
  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);

  auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
  MIB.addDef(TRI->getReturnAddressReg(MF));

  if (!addCallTargetOperands(MIB, MIRBuilder, Info))
    return false;

  // Tell the call which registers are clobbered.
  const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
  MIB.addRegMask(Mask);

  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(Info.CallConv, Info.IsVarArg, MF, ArgLocs, F.getContext());

  // We could pass MIB and directly add the implicit uses to the call
  // now. However, as an aesthetic choice, place implicit argument operands
  // after the ordinary user argument registers.
  SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

  if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
    // With a fixed ABI, allocate fixed registers before user arguments.
    if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
      return false;
  }

  // Do the actual argument marshalling.
  SmallVector<Register, 8> PhysRegs;
  OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
                             AssignFnVarArg, false);
  if (!handleAssignments(CCInfo, ArgLocs, MIRBuilder, OutArgs, Handler))
    return false;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Insert copies for the SRD. In the HSA case, this should be an identity
  // copy.
  auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::vector(4, 32),
                                             MFI->getScratchRSrcReg());
  MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
  MIB.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

  for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
    MIRBuilder.buildCopy((Register)ArgReg.first, ArgReg.second);
    MIB.addReg(ArgReg.first, RegState::Implicit);
  }

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // Now we can add the actual call instruction to the correct position.
  MIRBuilder.insertInstr(MIB);

  // If Callee is a reg, since it is used by a target specific
  // instruction, it must have a register class matching the
  // constraint of that instruction.

  // FIXME: We should define regbankselectable call instructions to handle
  // divergent call targets.
  if (MIB->getOperand(1).isReg()) {
    MIB->getOperand(1).setReg(constrainOperandRegClass(
        MF, *TRI, MRI, *ST.getInstrInfo(),
        *ST.getRegBankInfo(), *MIB, MIB->getDesc(), MIB->getOperand(1),
        1));
  }

  // Finally we can copy the returned value back into its virtual-register. In
  // symmetry with the arguments, the physical register must be an
  // implicit-define of the call instruction.
  if (!Info.OrigRet.Ty->isVoidTy()) {
    CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv,
                                                      Info.IsVarArg);
    CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
    if (!handleAssignments(MIRBuilder, InArgs, Handler))
      return false;
  }

  uint64_t CalleePopBytes = NumBytes;
  MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN)
    .addImm(0)
    .addImm(CalleePopBytes);

  return true;
}