//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
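//
// For example (illustrative IR only, not taken from an actual test):
//
//   %stack = alloca [4 x i32], addrspace(5)
//   %gep = getelementptr [4 x i32], [4 x i32] addrspace(5)* %stack,
//          i32 0, i32 %idx
//   store i32 %v, i32 addrspace(5)* %gep
//
// can have its element accesses rewritten in terms of a <4 x i32> value using
// insertelement/extractelement, or, if the kernel has enough unused LDS, the
// alloca can be replaced with a per-workitem slice of an addrspace(3) array.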
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool> DisablePromoteAllocaToVector(
  "disable-promote-alloca-to-vector",
  cl::desc("Disable promote alloca to vector"),
  cl::init(false));

static cl::opt<bool> DisablePromoteAllocaToLDS(
  "disable-promote-alloca-to-lds",
  cl::desc("Disable promote alloca to LDS"),
  cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
  "amdgpu-promote-alloca-to-vector-limit",
  cl::desc("Maximum byte size to consider promote alloca to vector"),
  cl::init(0));

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca,
                               Value *Val,
                               std::vector<Value*> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val
  /// should be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
  bool run(Function &F);
};

class AMDGPUPromoteAllocaToVector : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "AMDGPU Promote Alloca to vector";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promoting allocas, for an
// accurate estimate of the LDS available.
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
    return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
  }
  return false;
}

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

bool AMDGPUPromoteAllocaImpl::run(Function &F) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const Triple &TT = TM.getTargetTriple();
  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  if (IsAMDGCN) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
    // A non-entry function has only 32 caller preserved registers.
    // Do not promote alloca which will force spilling.
    if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
      MaxVGPRs = std::min(MaxVGPRs, 32u);
  } else {
    MaxVGPRs = 128;
  }

  bool SufficientLDS = hasSufficientLocalMem(F);
  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

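  // Collect the candidate allocas up front: handleAlloca rewrites and erases
  // instructions in the entry block, so avoid iterating over it while it is
  // being modified.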
  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handleAlloca(*AI, SufficientLDS))
      Changed = true;
  }

  return Changed;
}

std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);
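  // The kernel now reads the dispatch pointer, so it can no longer be marked
  // as not needing it.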
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

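// Emit a call to the workitem id intrinsic for dimension N (0 = x, 1 = y,
// 2 = z), annotate its range, and record that the kernel requires that id.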
Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  Function *F = Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  StringRef AttrName;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;

  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);

  return CI;
}

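// Convert an array allocation type [N x T] into the equivalent vector type
// <N x T>.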
static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
  return FixedVectorType::get(ArrayTy->getElementType(),
                              ArrayTy->getNumElements());
}

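// Look up the vector element index previously computed for a GEP into the
// alloca. A use of the alloca pointer itself (no GEP) addresses element 0.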
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
  if (!GEP)
    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  auto I = GEPIdx.find(GEP);
  assert(I != GEPIdx.end() && "Must have entry for GEP!");
  return I->second;
}

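// Compute which element of the promoted vector a GEP off the alloca refers
// to, or return nullptr if the offset cannot be expressed as a single index
// without inserting extra arithmetic.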
static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                               Type *VecElemTy, const DataLayout &DL) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  MapVector<Value *, APInt> VarOffsets;
  APInt ConstOffset(BW, 0);
  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
    return nullptr;

  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return nullptr;

  if (VarOffsets.size() == 1) {
    // Only handle cases where we don't need to insert extra arithmetic
    // instructions.
    const auto &VarOffset = VarOffsets.front();
    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
      return nullptr;
    return VarOffset.first;
  }

  APInt Quot;
  uint64_t Rem;
  APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
  if (Rem != 0)
    return nullptr;

  return ConstantInt::get(GEP->getContext(), Quot);
}

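// Try to vectorize accesses to a small, statically sized array alloca:
// element loads and stores are rewritten as whole-vector loads/stores
// combined with extractelement/insertelement, so later passes can keep the
// value in registers.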
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                     unsigned MaxVGPRs) {

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion of alloca to vector is disabled\n");
    return false;
  }

  Type *AllocaTy = Alloca->getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
        ArrayTy->getNumElements() > 0)
      VectorTy = arrayTypeToVecType(ArrayTy);
  }

  // Use up to 1/4 of available register budget for vectorization.
  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                              : (MaxVGPRs * 32);

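  // Limit is in bits: the option above is specified in bytes and each VGPR
  // holds 32 bits per lane. The check below rejects allocas larger than a
  // quarter of this budget.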
  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject allocas of the form [ 2 x [ 2 x i32 ]] or
  // equivalent. Potentially these could also be promoted, but we don't
  // currently handle this case.
  if (!VectorTy || VectorTy->getNumElements() > 16 ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  SmallVector<Instruction *> WorkList;
  SmallVector<Use *, 8> Uses;
  for (Use &U : Alloca->uses())
    Uses.push_back(&U);

  Type *VecEltTy = VectorTy->getElementType();
  while (!Uses.empty()) {
    Use *U = Uses.pop_back_val();
    Instruction *Inst = dyn_cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      // This is a store of the pointer, not to the pointer.
      if (isa<StoreInst>(Inst) &&
          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return false;

      Type *AccessTy = getLoadStoreType(Inst);
      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector, leave alone.
      if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
                               DL.getTypeStoreSize(AccessTy))
        continue;

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple ||
          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
        return false;

      WorkList.push_back(Inst);
      continue;
    }

    if (isa<BitCastInst>(Inst)) {
      // Look through bitcasts.
      for (Use &U : Inst->uses())
        Uses.push_back(&U);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
      if (!Index) {
        LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
                          << '\n');
        return false;
      }

      GEPVectorIdx[GEP] = Index;
      for (Use &U : Inst->uses())
        Uses.push_back(&U);
      continue;
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(Inst))
      continue;

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        }))
      continue;

    // Unknown user.
    return false;
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');

  for (Instruction *Inst : WorkList) {
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      if (Inst->getType() != VecEltTy)
        ExtractElement =
            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(Inst);
      Value *Ptr = SI->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *Elt = SI->getValueOperand();
      if (Elt->getType() != VecEltTy)
        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

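// Only calls to a small set of memory and lifetime intrinsics are safe to
// rewrite when the alloca is moved to the local address space.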
static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}

bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {

  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote an icmp if we know that the other operand is derived from
    // a pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    // Do not promote vector/aggregate type instructions. It is hard to track
    // their users.
    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
      return false;

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}

bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

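  // Returns true if some transitive user of Val is an instruction inside F;
  // constant users are queued on Stack for later traversal.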
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead
  llvm::sort(AllocatedSizes, llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
                                                          F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
    return false;

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // we could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

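  // Replace the alloca with an LDS array containing one copy of the allocated
  // type per workitem in the workgroup, indexed by the linearized workitem id
  // computed below.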
91279a1fd71STom Stellard   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
91345bb48eaSTom Stellard   GlobalVariable *GV = new GlobalVariable(
914cf84e26fSMatt Arsenault       *Mod, GVTy, false, GlobalValue::InternalLinkage,
915cf84e26fSMatt Arsenault       UndefValue::get(GVTy),
916cf84e26fSMatt Arsenault       Twine(F->getName()) + Twine('.') + I.getName(),
917cf84e26fSMatt Arsenault       nullptr,
918cf84e26fSMatt Arsenault       GlobalVariable::NotThreadLocal,
9190da6350dSMatt Arsenault       AMDGPUAS::LOCAL_ADDRESS);
92096efdd61SPeter Collingbourne   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9211172712fSArthur Eubanks   GV->setAlignment(I.getAlign());
92245bb48eaSTom Stellard 
923e0132464SMatt Arsenault   Value *TCntY, *TCntZ;
92445bb48eaSTom Stellard 
925e0132464SMatt Arsenault   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
926e0132464SMatt Arsenault   Value *TIdX = getWorkitemID(Builder, 0);
927e0132464SMatt Arsenault   Value *TIdY = getWorkitemID(Builder, 1);
928e0132464SMatt Arsenault   Value *TIdZ = getWorkitemID(Builder, 2);
92945bb48eaSTom Stellard 
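  // Compute this lane's flat workitem index within the work-group:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
  // It selects the lane's private slot in the LDS array created above.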
930853a1fc6SMatt Arsenault   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
93145bb48eaSTom Stellard   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
932853a1fc6SMatt Arsenault   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
93345bb48eaSTom Stellard   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
93445bb48eaSTom Stellard   TID = Builder.CreateAdd(TID, TIdZ);
93545bb48eaSTom Stellard 
936853a1fc6SMatt Arsenault   Value *Indices[] = {
937853a1fc6SMatt Arsenault     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
938853a1fc6SMatt Arsenault     TID
939853a1fc6SMatt Arsenault   };
94045bb48eaSTom Stellard 
941853a1fc6SMatt Arsenault   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
94245bb48eaSTom Stellard   I.mutateType(Offset->getType());
94345bb48eaSTom Stellard   I.replaceAllUsesWith(Offset);
94445bb48eaSTom Stellard   I.eraseFromParent();
94545bb48eaSTom Stellard 
946cb41ee92SStanislav Mekhanoshin   SmallVector<IntrinsicInst *> DeferredIntrs;
947cb41ee92SStanislav Mekhanoshin 
948fb8cdbaeSMatt Arsenault   for (Value *V : WorkList) {
94945bb48eaSTom Stellard     CallInst *Call = dyn_cast<CallInst>(V);
95045bb48eaSTom Stellard     if (!Call) {
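      // Pointer constants used together with the promoted value must be
      // retyped as well; e.g. a null compared against the alloca has to
      // become a null in the LDS address space for the icmp to stay valid.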
951891fccc0SMatt Arsenault       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
952891fccc0SMatt Arsenault         Value *Src0 = CI->getOperand(0);
95389612938SArthur Eubanks         PointerType *NewTy = PointerType::getWithSamePointeeType(
95489612938SArthur Eubanks             cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);
95545bb48eaSTom Stellard 
956891fccc0SMatt Arsenault         if (isa<ConstantPointerNull>(CI->getOperand(0)))
957891fccc0SMatt Arsenault           CI->setOperand(0, ConstantPointerNull::get(NewTy));
958891fccc0SMatt Arsenault 
959891fccc0SMatt Arsenault         if (isa<ConstantPointerNull>(CI->getOperand(1)))
960891fccc0SMatt Arsenault           CI->setOperand(1, ConstantPointerNull::get(NewTy));
961891fccc0SMatt Arsenault 
962891fccc0SMatt Arsenault         continue;
963891fccc0SMatt Arsenault       }
964891fccc0SMatt Arsenault 
9652402b95dSMatt Arsenault       // The cast's source operand will be corrected on its own, and we do not
9662402b95dSMatt Arsenault       // want to touch its users.
96745bb48eaSTom Stellard       if (isa<AddrSpaceCastInst>(V))
96845bb48eaSTom Stellard         continue;
96945bb48eaSTom Stellard 
97089612938SArthur Eubanks       PointerType *NewTy = PointerType::getWithSamePointeeType(
97189612938SArthur Eubanks           cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);
972891fccc0SMatt Arsenault 
97345bb48eaSTom Stellard       // FIXME: It doesn't really make sense to try to do this for all
97445bb48eaSTom Stellard       // instructions.
97545bb48eaSTom Stellard       V->mutateType(NewTy);
976891fccc0SMatt Arsenault 
977891fccc0SMatt Arsenault       // Adjust the types of any constant operands.
978891fccc0SMatt Arsenault       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
979891fccc0SMatt Arsenault         if (isa<ConstantPointerNull>(SI->getOperand(1)))
980891fccc0SMatt Arsenault           SI->setOperand(1, ConstantPointerNull::get(NewTy));
981891fccc0SMatt Arsenault 
982891fccc0SMatt Arsenault         if (isa<ConstantPointerNull>(SI->getOperand(2)))
983891fccc0SMatt Arsenault           SI->setOperand(2, ConstantPointerNull::get(NewTy));
984891fccc0SMatt Arsenault       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
985891fccc0SMatt Arsenault         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
986891fccc0SMatt Arsenault           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
987891fccc0SMatt Arsenault             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
988891fccc0SMatt Arsenault         }
989891fccc0SMatt Arsenault       }
990891fccc0SMatt Arsenault 
99145bb48eaSTom Stellard       continue;
99245bb48eaSTom Stellard     }
99345bb48eaSTom Stellard 
9942e08e181SMatt Arsenault     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
99545bb48eaSTom Stellard     Builder.SetInsertPoint(Intr);
99645bb48eaSTom Stellard     switch (Intr->getIntrinsicID()) {
99745bb48eaSTom Stellard     case Intrinsic::lifetime_start:
99845bb48eaSTom Stellard     case Intrinsic::lifetime_end:
99945bb48eaSTom Stellard       // These intrinsics are for address space 0 only.
100045bb48eaSTom Stellard       Intr->eraseFromParent();
100145bb48eaSTom Stellard       continue;
1002cb41ee92SStanislav Mekhanoshin     case Intrinsic::memcpy:
1003cb41ee92SStanislav Mekhanoshin     case Intrinsic::memmove:
1004cb41ee92SStanislav Mekhanoshin       // These have two pointer operands. In case the second pointer also needs
1005cb41ee92SStanislav Mekhanoshin       // to be replaced, we defer processing of these intrinsics until all
1006cb41ee92SStanislav Mekhanoshin       // other values have been processed.
1007cb41ee92SStanislav Mekhanoshin       DeferredIntrs.push_back(Intr);
100845bb48eaSTom Stellard       continue;
100945bb48eaSTom Stellard     case Intrinsic::memset: {
101045bb48eaSTom Stellard       MemSetInst *MemSet = cast<MemSetInst>(Intr);
10111b2842bfSGuillaume Chatelet       Builder.CreateMemSet(
10121b2842bfSGuillaume Chatelet           MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
10131b2842bfSGuillaume Chatelet           MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
101445bb48eaSTom Stellard       Intr->eraseFromParent();
101545bb48eaSTom Stellard       continue;
101645bb48eaSTom Stellard     }
10170b783ef0SMatt Arsenault     case Intrinsic::invariant_start:
10180b783ef0SMatt Arsenault     case Intrinsic::invariant_end:
10195dde8094SPiotr Padlewski     case Intrinsic::launder_invariant_group:
10205b3db45eSPiotr Padlewski     case Intrinsic::strip_invariant_group:
10210b783ef0SMatt Arsenault       Intr->eraseFromParent();
10220b783ef0SMatt Arsenault       // FIXME: I think the invariant marker should still theoretically apply,
10230b783ef0SMatt Arsenault       // but the intrinsics need to be changed to accept pointers with any
10240b783ef0SMatt Arsenault       // address space.
10250b783ef0SMatt Arsenault       continue;
10267e747f1aSMatt Arsenault     case Intrinsic::objectsize: {
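      // llvm.objectsize is overloaded on its return and pointer types, so the
      // call is rebuilt against a declaration that takes an LDS pointer.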
10277e747f1aSMatt Arsenault       Value *Src = Intr->getOperand(0);
102889612938SArthur Eubanks       Function *ObjectSize = Intrinsic::getDeclaration(
102989612938SArthur Eubanks           Mod, Intrinsic::objectsize,
103089612938SArthur Eubanks           {Intr->getType(),
103189612938SArthur Eubanks            PointerType::getWithSamePointeeType(
103289612938SArthur Eubanks                cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});
10337e747f1aSMatt Arsenault 
103456c7e88cSGeorge Burgess IV       CallInst *NewCall = Builder.CreateCall(
1035600e9deaSErik Pilkington           ObjectSize,
1036600e9deaSErik Pilkington           {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
10377e747f1aSMatt Arsenault       Intr->replaceAllUsesWith(NewCall);
10387e747f1aSMatt Arsenault       Intr->eraseFromParent();
10397e747f1aSMatt Arsenault       continue;
10407e747f1aSMatt Arsenault     }
104145bb48eaSTom Stellard     default:
10428c209aa8SMatthias Braun       Intr->print(errs());
104345bb48eaSTom Stellard       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
104445bb48eaSTom Stellard     }
104545bb48eaSTom Stellard   }
1046cb41ee92SStanislav Mekhanoshin 
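  // With every value in the work list retyped, recreate the deferred
  // memcpy/memmove calls so that both the destination and the source operand
  // (either of which may now be an LDS pointer) carry the right types.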
1047cb41ee92SStanislav Mekhanoshin   for (IntrinsicInst *Intr : DeferredIntrs) {
1048cb41ee92SStanislav Mekhanoshin     Builder.SetInsertPoint(Intr);
1049cb41ee92SStanislav Mekhanoshin     Intrinsic::ID ID = Intr->getIntrinsicID();
1050cb41ee92SStanislav Mekhanoshin     assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1051cb41ee92SStanislav Mekhanoshin 
1052cb41ee92SStanislav Mekhanoshin     MemTransferInst *MI = cast<MemTransferInst>(Intr);
1053cb41ee92SStanislav Mekhanoshin     auto *B =
1054cb41ee92SStanislav Mekhanoshin       Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
1055cb41ee92SStanislav Mekhanoshin                                     MI->getRawSource(), MI->getSourceAlign(),
1056cb41ee92SStanislav Mekhanoshin                                     MI->getLength(), MI->isVolatile());
1057cb41ee92SStanislav Mekhanoshin 
105844a3241fSArthur Eubanks     for (unsigned I = 0; I != 2; ++I) {
105944a3241fSArthur Eubanks       if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
106044a3241fSArthur Eubanks         B->addDereferenceableParamAttr(I, Bytes);
1061cb41ee92SStanislav Mekhanoshin       }
1062cb41ee92SStanislav Mekhanoshin     }
1063cb41ee92SStanislav Mekhanoshin 
1064cb41ee92SStanislav Mekhanoshin     Intr->eraseFromParent();
1065cb41ee92SStanislav Mekhanoshin   }
1066cb41ee92SStanislav Mekhanoshin 
10671dbace19SChangpeng Fang   return true;
106845bb48eaSTom Stellard }
106945bb48eaSTom Stellard 
handlePromoteAllocaToVector(AllocaInst & I,unsigned MaxVGPRs)10700e9abcfcSArthur Eubanks bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
10710e9abcfcSArthur Eubanks   // Array allocations are probably not worth handling, since an allocation of
10720e9abcfcSArthur Eubanks   // the array type is the canonical form.
10730e9abcfcSArthur Eubanks   if (!I.isStaticAlloca() || I.isArrayAllocation())
1074689e616eSStanislav Mekhanoshin     return false;
1075689e616eSStanislav Mekhanoshin 
10760e9abcfcSArthur Eubanks   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
10770e9abcfcSArthur Eubanks 
10780e9abcfcSArthur Eubanks   Module *Mod = I.getParent()->getParent()->getParent();
10790e9abcfcSArthur Eubanks   return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
10800e9abcfcSArthur Eubanks }
10810e9abcfcSArthur Eubanks 
promoteAllocasToVector(Function & F,TargetMachine & TM)10820e9abcfcSArthur Eubanks bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
10830e9abcfcSArthur Eubanks   if (DisablePromoteAllocaToVector)
108454e2dc75SStanislav Mekhanoshin     return false;
108554e2dc75SStanislav Mekhanoshin 
10860e9abcfcSArthur Eubanks   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
108754e2dc75SStanislav Mekhanoshin   if (!ST.isPromoteAllocaEnabled())
108854e2dc75SStanislav Mekhanoshin     return false;
108954e2dc75SStanislav Mekhanoshin 
10900e9abcfcSArthur Eubanks   unsigned MaxVGPRs;
10910e9abcfcSArthur Eubanks   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
10920e9abcfcSArthur Eubanks     const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
109354e2dc75SStanislav Mekhanoshin     MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
1094cf74ef13SStanislav Mekhanoshin     // A non-entry function has only 32 caller-preserved registers.
1095cf74ef13SStanislav Mekhanoshin     // Do not promote an alloca that would force spilling.
1096cf74ef13SStanislav Mekhanoshin     if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
1097cf74ef13SStanislav Mekhanoshin       MaxVGPRs = std::min(MaxVGPRs, 32u);
109854e2dc75SStanislav Mekhanoshin   } else {
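    // R600 is the only other target that runs this pass; use a fixed register
    // budget there.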
109954e2dc75SStanislav Mekhanoshin     MaxVGPRs = 128;
110054e2dc75SStanislav Mekhanoshin   }
110154e2dc75SStanislav Mekhanoshin 
1102689e616eSStanislav Mekhanoshin   bool Changed = false;
1103689e616eSStanislav Mekhanoshin   BasicBlock &EntryBB = *F.begin();
1104689e616eSStanislav Mekhanoshin 
1105689e616eSStanislav Mekhanoshin   SmallVector<AllocaInst *, 16> Allocas;
1106689e616eSStanislav Mekhanoshin   for (Instruction &I : EntryBB) {
1107689e616eSStanislav Mekhanoshin     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
1108689e616eSStanislav Mekhanoshin       Allocas.push_back(AI);
1109689e616eSStanislav Mekhanoshin   }
1110689e616eSStanislav Mekhanoshin 
1111689e616eSStanislav Mekhanoshin   for (AllocaInst *AI : Allocas) {
11120e9abcfcSArthur Eubanks     if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
1113689e616eSStanislav Mekhanoshin       Changed = true;
1114689e616eSStanislav Mekhanoshin   }
1115689e616eSStanislav Mekhanoshin 
1116689e616eSStanislav Mekhanoshin   return Changed;
1117689e616eSStanislav Mekhanoshin }
1118689e616eSStanislav Mekhanoshin 
runOnFunction(Function & F)11190e9abcfcSArthur Eubanks bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
11200e9abcfcSArthur Eubanks   if (skipFunction(F))
1121689e616eSStanislav Mekhanoshin     return false;
11220e9abcfcSArthur Eubanks   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
11230e9abcfcSArthur Eubanks     return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
11240e9abcfcSArthur Eubanks   }
11250e9abcfcSArthur Eubanks   return false;
11260e9abcfcSArthur Eubanks }
1127689e616eSStanislav Mekhanoshin 
11280e9abcfcSArthur Eubanks PreservedAnalyses
run(Function & F,FunctionAnalysisManager & AM)11290e9abcfcSArthur Eubanks AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
11300e9abcfcSArthur Eubanks   bool Changed = promoteAllocasToVector(F, TM);
11310e9abcfcSArthur Eubanks   if (Changed) {
11320e9abcfcSArthur Eubanks     PreservedAnalyses PA;
11330e9abcfcSArthur Eubanks     PA.preserveSet<CFGAnalyses>();
11340e9abcfcSArthur Eubanks     return PA;
11350e9abcfcSArthur Eubanks   }
11360e9abcfcSArthur Eubanks   return PreservedAnalyses::all();
1137689e616eSStanislav Mekhanoshin }
1138689e616eSStanislav Mekhanoshin 
createAMDGPUPromoteAlloca()11398b61764cSFrancis Visoiu Mistrih FunctionPass *llvm::createAMDGPUPromoteAlloca() {
11408b61764cSFrancis Visoiu Mistrih   return new AMDGPUPromoteAlloca();
114145bb48eaSTom Stellard }
1142689e616eSStanislav Mekhanoshin 
createAMDGPUPromoteAllocaToVector()1143689e616eSStanislav Mekhanoshin FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
1144689e616eSStanislav Mekhanoshin   return new AMDGPUPromoteAllocaToVector();
1145689e616eSStanislav Mekhanoshin }
1146