145bb48eaSTom Stellard //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
245bb48eaSTom Stellard //
32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information.
52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
645bb48eaSTom Stellard //
745bb48eaSTom Stellard //===----------------------------------------------------------------------===//
845bb48eaSTom Stellard //
945bb48eaSTom Stellard // This pass eliminates allocas by either converting them into vectors or
1045bb48eaSTom Stellard // by migrating them to local address space.
1145bb48eaSTom Stellard //
1245bb48eaSTom Stellard //===----------------------------------------------------------------------===//
1345bb48eaSTom Stellard
1445bb48eaSTom Stellard #include "AMDGPU.h"
15560d7e04Sdfukalov #include "GCNSubtarget.h"
16e188aae4Sserge-sans-paille #include "Utils/AMDGPUBaseInfo.h"
17c85abbd9SChangpeng Fang #include "llvm/Analysis/CaptureTracking.h"
1845bb48eaSTom Stellard #include "llvm/Analysis/ValueTracking.h"
198b61764cSFrancis Visoiu Mistrih #include "llvm/CodeGen/TargetPassConfig.h"
206bda14b3SChandler Carruth #include "llvm/IR/IRBuilder.h"
21e188aae4Sserge-sans-paille #include "llvm/IR/IntrinsicInst.h"
225d986953SReid Kleckner #include "llvm/IR/IntrinsicsAMDGPU.h"
235d986953SReid Kleckner #include "llvm/IR/IntrinsicsR600.h"
24734bb7bbSEugene Zelenko #include "llvm/Pass.h"
25734bb7bbSEugene Zelenko #include "llvm/Target/TargetMachine.h"
2645bb48eaSTom Stellard
2745bb48eaSTom Stellard #define DEBUG_TYPE "amdgpu-promote-alloca"
2845bb48eaSTom Stellard
2945bb48eaSTom Stellard using namespace llvm;
3045bb48eaSTom Stellard
3145bb48eaSTom Stellard namespace {
3245bb48eaSTom Stellard
// Escape hatch: completely disable the alloca -> vector transformation.
static cl::opt<bool> DisablePromoteAllocaToVector(
  "disable-promote-alloca-to-vector",
  cl::desc("Disable promote alloca to vector"),
  cl::init(false));

// Escape hatch: completely disable the alloca -> LDS transformation.
static cl::opt<bool> DisablePromoteAllocaToLDS(
  "disable-promote-alloca-to-lds",
  cl::desc("Disable promote alloca to LDS"),
  cl::init(false));

// 0 (the default) means "derive the size limit from the VGPR budget"; a
// nonzero value overrides that heuristic (see tryPromoteAllocaToVector).
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
  "amdgpu-promote-alloca-to-vector-limit",
  cl::desc("Maximum byte size to consider promote alloca to vector"),
  cl::init(0));
// FIXME: This can create globals so should be a module pass.
// Legacy pass-manager wrapper; all real work is delegated to
// AMDGPUPromoteAllocaImpl from runOnFunction().
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  // NOTE(review): declared here, but the promotion logic visible in this
  // file lives on AMDGPUPromoteAllocaImpl — confirm this declaration is
  // still referenced anywhere.
  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Allocas are rewritten in place; the CFG is never modified.
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};
660e9abcfcSArthur Eubanks
// Shared implementation behind both the legacy and new pass-manager entry
// points. Holds per-function state while the entry block's allocas are
// considered for promotion to vectors or LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;        // LDS bytes available for promotion.
  uint32_t CurrentLocalMemUsage = 0; // LDS bytes already accounted for.
  unsigned MaxVGPRs;                 // Register budget capping vectorization.

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  // Materialize the workgroup {Y, Z} dimensions at the builder's insertion
  // point (intrinsics on r600, dispatch-packet loads on AMDHSA).
  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  // Materialize the workitem id for dimension N (0=x, 1=y, 2=z).
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca,
                               Value *Val,
                               std::vector<Value*> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  // Attempt to promote a single alloca; returns true if it was changed.
  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
  bool run(Function &F);
};
10745bb48eaSTom Stellard
// Legacy pass-manager wrapper that performs only the alloca -> vector
// transformation (no LDS promotion).
class AMDGPUPromoteAllocaToVector : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "AMDGPU Promote Alloca to vector";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Straight-line alloca rewriting only; the CFG is untouched.
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};
125689e616eSStanislav Mekhanoshin
126734bb7bbSEugene Zelenko } // end anonymous namespace
12745bb48eaSTom Stellard
char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

// Legacy pass-manager registration.
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)

// Externally visible handles the target uses to reference these passes.
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
144e0132464SMatt Arsenault
runOnFunction(Function & F)14545bb48eaSTom Stellard bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
1468b61764cSFrancis Visoiu Mistrih if (skipFunction(F))
147e0132464SMatt Arsenault return false;
148e0132464SMatt Arsenault
1490e9abcfcSArthur Eubanks if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
1500e9abcfcSArthur Eubanks return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
1510e9abcfcSArthur Eubanks }
1528b61764cSFrancis Visoiu Mistrih return false;
1530e9abcfcSArthur Eubanks }
1548b61764cSFrancis Visoiu Mistrih
run(Function & F,FunctionAnalysisManager & AM)1550e9abcfcSArthur Eubanks PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
1560e9abcfcSArthur Eubanks FunctionAnalysisManager &AM) {
1570e9abcfcSArthur Eubanks bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
1580e9abcfcSArthur Eubanks if (Changed) {
1590e9abcfcSArthur Eubanks PreservedAnalyses PA;
1600e9abcfcSArthur Eubanks PA.preserveSet<CFGAnalyses>();
1610e9abcfcSArthur Eubanks return PA;
1620e9abcfcSArthur Eubanks }
1630e9abcfcSArthur Eubanks return PreservedAnalyses::all();
1640e9abcfcSArthur Eubanks }
1650e9abcfcSArthur Eubanks
run(Function & F)1660e9abcfcSArthur Eubanks bool AMDGPUPromoteAllocaImpl::run(Function &F) {
1670e9abcfcSArthur Eubanks Mod = F.getParent();
1680e9abcfcSArthur Eubanks DL = &Mod->getDataLayout();
1690e9abcfcSArthur Eubanks
1700e9abcfcSArthur Eubanks const Triple &TT = TM.getTargetTriple();
1718b61764cSFrancis Visoiu Mistrih IsAMDGCN = TT.getArch() == Triple::amdgcn;
1728b61764cSFrancis Visoiu Mistrih IsAMDHSA = TT.getOS() == Triple::AMDHSA;
1738b61764cSFrancis Visoiu Mistrih
1740e9abcfcSArthur Eubanks const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
17503d85845SMatt Arsenault if (!ST.isPromoteAllocaEnabled())
17603d85845SMatt Arsenault return false;
1771dbace19SChangpeng Fang
17854e2dc75SStanislav Mekhanoshin if (IsAMDGCN) {
1790e9abcfcSArthur Eubanks const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
18054e2dc75SStanislav Mekhanoshin MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
181cf74ef13SStanislav Mekhanoshin // A non-entry function has only 32 caller preserved registers.
182cf74ef13SStanislav Mekhanoshin // Do not promote alloca which will force spilling.
183cf74ef13SStanislav Mekhanoshin if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
184cf74ef13SStanislav Mekhanoshin MaxVGPRs = std::min(MaxVGPRs, 32u);
18554e2dc75SStanislav Mekhanoshin } else {
18654e2dc75SStanislav Mekhanoshin MaxVGPRs = 128;
18754e2dc75SStanislav Mekhanoshin }
18854e2dc75SStanislav Mekhanoshin
1891dbace19SChangpeng Fang bool SufficientLDS = hasSufficientLocalMem(F);
1901dbace19SChangpeng Fang bool Changed = false;
191bafc9dc5SMatt Arsenault BasicBlock &EntryBB = *F.begin();
192bafc9dc5SMatt Arsenault
19323f03f50SMatt Arsenault SmallVector<AllocaInst *, 16> Allocas;
19423f03f50SMatt Arsenault for (Instruction &I : EntryBB) {
19523f03f50SMatt Arsenault if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
19623f03f50SMatt Arsenault Allocas.push_back(AI);
19723f03f50SMatt Arsenault }
19823f03f50SMatt Arsenault
19923f03f50SMatt Arsenault for (AllocaInst *AI : Allocas) {
20023f03f50SMatt Arsenault if (handleAlloca(*AI, SufficientLDS))
20123f03f50SMatt Arsenault Changed = true;
202bafc9dc5SMatt Arsenault }
20345bb48eaSTom Stellard
2041dbace19SChangpeng Fang return Changed;
20545bb48eaSTom Stellard }
20645bb48eaSTom Stellard
// Return the workgroup {Y, Z} dimensions as IR values at the builder's
// current insertion point. On non-HSA (r600) targets these are read via
// dedicated intrinsics; on AMDHSA they are loaded out of the kernel
// dispatch packet reached through the dispatch pointer.
std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    // Attach !range metadata bounding the values by the workgroup size.
    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  // typedef struct hsa_kernel_dispatch_packet_s {
  //   uint16_t header;
  //   uint16_t setup;
  //   uint16_t workgroup_size_x ;
  //   uint16_t workgroup_size_y;
  //   uint16_t workgroup_size_z;
  //   uint16_t reserved0;
  //   uint32_t grid_size_x ;
  //   uint32_t grid_size_y ;
  //   uint32_t grid_size_z;
  //
  //   uint32_t private_segment_size;
  //   uint32_t group_segment_size;
  //   uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //   void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //   void *kernarg_address;
  //   uint32_t reserved1;
  // #else
  //   uint32_t reserved1;
  //   void *kernarg_address;
  // #endif
  //   uint64_t reserved2;
  //   hsa_signal_t completion_signal; // uint64_t wrapper
  // } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);
  // The function now requires the dispatch pointer input, so drop the
  // attribute that says it doesn't need it.
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
      DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  // i32 index 1 covers bytes 4-7: workgroup_size_x | workgroup_size_y << 16.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  // i32 index 2 covers bytes 8-11: workgroup_size_z | reserved0 << 16.
  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  // The dispatch packet never changes during a kernel launch.
  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}
295e0132464SMatt Arsenault
getWorkitemID(IRBuilder<> & Builder,unsigned N)2960e9abcfcSArthur Eubanks Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
2970e9abcfcSArthur Eubanks unsigned N) {
29804ce2de3SMatt Arsenault Function *F = Builder.GetInsertBlock()->getParent();
29904ce2de3SMatt Arsenault const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
3005d986953SReid Kleckner Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
30104ce2de3SMatt Arsenault StringRef AttrName;
302e0132464SMatt Arsenault
303e0132464SMatt Arsenault switch (N) {
304e0132464SMatt Arsenault case 0:
30511b2b2f4SMichael Liao IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
30611b2b2f4SMichael Liao : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
30704ce2de3SMatt Arsenault AttrName = "amdgpu-no-workitem-id-x";
308e0132464SMatt Arsenault break;
309e0132464SMatt Arsenault case 1:
31011b2b2f4SMichael Liao IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
31111b2b2f4SMichael Liao : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
31204ce2de3SMatt Arsenault AttrName = "amdgpu-no-workitem-id-y";
313e0132464SMatt Arsenault break;
314e0132464SMatt Arsenault
315e0132464SMatt Arsenault case 2:
31611b2b2f4SMichael Liao IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
31711b2b2f4SMichael Liao : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
31804ce2de3SMatt Arsenault AttrName = "amdgpu-no-workitem-id-z";
319e0132464SMatt Arsenault break;
320e0132464SMatt Arsenault default:
321e0132464SMatt Arsenault llvm_unreachable("invalid dimension");
322e0132464SMatt Arsenault }
323e0132464SMatt Arsenault
324e0132464SMatt Arsenault Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
325e0132464SMatt Arsenault CallInst *CI = Builder.CreateCall(WorkitemIdFn);
326c90347d7SStanislav Mekhanoshin ST.makeLIDRangeMetadata(CI);
32704ce2de3SMatt Arsenault F->removeFnAttr(AttrName);
328e0132464SMatt Arsenault
329e0132464SMatt Arsenault return CI;
330e0132464SMatt Arsenault }
331e0132464SMatt Arsenault
arrayTypeToVecType(ArrayType * ArrayTy)3323254a001SChristopher Tetreault static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
3333254a001SChristopher Tetreault return FixedVectorType::get(ArrayTy->getElementType(),
33437ab4cf8SMatt Arsenault ArrayTy->getNumElements());
33545bb48eaSTom Stellard }
33645bb48eaSTom Stellard
33745bb48eaSTom Stellard static Value *
calculateVectorIndex(Value * Ptr,const std::map<GetElementPtrInst *,Value * > & GEPIdx)33845bb48eaSTom Stellard calculateVectorIndex(Value *Ptr,
33945bb48eaSTom Stellard const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
3403ed643eaSNikita Popov auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
341512e806aSStanislav Mekhanoshin if (!GEP)
3423ed643eaSNikita Popov return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
34345bb48eaSTom Stellard
34445bb48eaSTom Stellard auto I = GEPIdx.find(GEP);
3453ed643eaSNikita Popov assert(I != GEPIdx.end() && "Must have entry for GEP!");
3463ed643eaSNikita Popov return I->second;
34745bb48eaSTom Stellard }
34845bb48eaSTom Stellard
// Translate a GEP off Alloca into an index into the would-be vector of
// VecElemTy elements. Returns null when the GEP cannot be expressed as a
// single lane index without inserting extra arithmetic.
static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
                               Type *VecElemTy, const DataLayout &DL) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  MapVector<Value *, APInt> VarOffsets;
  APInt ConstOffset(BW, 0);
  // Decompose the GEP into (sum of variable*scale) + constant byte offset;
  // bail if it isn't based on this alloca or can't be decomposed.
  if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
      !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
    return nullptr;

  unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
  // More than one variable term would need an add we don't want to emit.
  if (VarOffsets.size() > 1)
    return nullptr;

  if (VarOffsets.size() == 1) {
    // Only handle cases where we don't need to insert extra arithmetic
    // instructions.
    const auto &VarOffset = VarOffsets.front();
    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
      return nullptr;
    return VarOffset.first;
  }

  // Purely constant offset: it must land exactly on an element boundary.
  APInt Quot;
  uint64_t Rem;
  APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
  if (Rem != 0)
    return nullptr;

  return ConstantInt::get(GEP->getContext(), Quot);
}
38145bb48eaSTom Stellard
// Attempt to replace an alloca of an array (or vector) type with SSA values
// of a vector type, rewriting each element load/store into an
// extractelement/insertelement on a full-vector load/store. Returns true if
// the alloca was rewritten.
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                     unsigned MaxVGPRs) {

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion alloca to vector is disabled\n");
    return false;
  }

  // Accept either an existing vector type or an array of valid vector
  // elements that we can reinterpret as a vector.
  Type *AllocaTy = Alloca->getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
        ArrayTy->getNumElements() > 0)
      VectorTy = arrayTypeToVecType(ArrayTy);
  }

  // Use up to 1/4 of available register budget for vectorization.
  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                              : (MaxVGPRs * 32);

  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
  // could also be promoted but we don't currently handle this case
  if (!VectorTy || VectorTy->getNumElements() > 16 ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  // Walk all (transitive) uses of the alloca. Any use we cannot model
  // defeats the promotion; loads/stores we can model go on WorkList and
  // GEPs record their lane index in GEPVectorIdx.
  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  SmallVector<Instruction *> WorkList;
  SmallVector<Use *, 8> Uses;
  for (Use &U : Alloca->uses())
    Uses.push_back(&U);

  Type *VecEltTy = VectorTy->getElementType();
  while (!Uses.empty()) {
    Use *U = Uses.pop_back_val();
    Instruction *Inst = dyn_cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      // This is a store of the pointer, not to the pointer.
      if (isa<StoreInst>(Inst) &&
          U->getOperandNo() != StoreInst::getPointerOperandIndex())
        return false;

      Type *AccessTy = getLoadStoreType(Inst);
      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector, leave alone.
      if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
                               DL.getTypeStoreSize(AccessTy))
        continue;

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple ||
          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
        return false;

      WorkList.push_back(Inst);
      continue;
    }

    if (isa<BitCastInst>(Inst)) {
      // Look through bitcasts.
      for (Use &U : Inst->uses())
        Uses.push_back(&U);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
      if (!Index) {
        LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
                          << '\n');
        return false;
      }

      GEPVectorIdx[GEP] = Index;
      for (Use &U : Inst->uses())
        Uses.push_back(&U);
      continue;
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(Inst))
      continue;

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        }))
      continue;

    // Unknown user.
    return false;
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');

  // All uses are modelable: rewrite each element access as a full-vector
  // load plus extract (loads) or load/insert/store (stores). Later passes
  // (SROA/mem2reg) clean up the whole-vector memory traffic.
  for (Instruction *Inst : WorkList) {
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      if (Inst->getType() != VecEltTy)
        ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(Inst);
      Value *Ptr = SI->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *Elt = SI->getValueOperand();
      if (Elt->getType() != VecEltTy)
        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}
53345bb48eaSTom Stellard
isCallPromotable(CallInst * CI)534ad134845SMatt Arsenault static bool isCallPromotable(CallInst *CI) {
535ad134845SMatt Arsenault IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
536ad134845SMatt Arsenault if (!II)
537ad134845SMatt Arsenault return false;
538ad134845SMatt Arsenault
539ad134845SMatt Arsenault switch (II->getIntrinsicID()) {
540ad134845SMatt Arsenault case Intrinsic::memcpy:
5417e747f1aSMatt Arsenault case Intrinsic::memmove:
542ad134845SMatt Arsenault case Intrinsic::memset:
543ad134845SMatt Arsenault case Intrinsic::lifetime_start:
544ad134845SMatt Arsenault case Intrinsic::lifetime_end:
545ad134845SMatt Arsenault case Intrinsic::invariant_start:
546ad134845SMatt Arsenault case Intrinsic::invariant_end:
5475dde8094SPiotr Padlewski case Intrinsic::launder_invariant_group:
5485b3db45eSPiotr Padlewski case Intrinsic::strip_invariant_group:
5497e747f1aSMatt Arsenault case Intrinsic::objectsize:
550ad134845SMatt Arsenault return true;
551ad134845SMatt Arsenault default:
552ad134845SMatt Arsenault return false;
553ad134845SMatt Arsenault }
554ad134845SMatt Arsenault }
555ad134845SMatt Arsenault
binaryOpIsDerivedFromSameAlloca(Value * BaseAlloca,Value * Val,Instruction * Inst,int OpIdx0,int OpIdx1) const5560e9abcfcSArthur Eubanks bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
5570e9abcfcSArthur Eubanks Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
558a61cb48dSMatt Arsenault int OpIdx1) const {
559a61cb48dSMatt Arsenault // Figure out which operand is the one we might not be promoting.
560a61cb48dSMatt Arsenault Value *OtherOp = Inst->getOperand(OpIdx0);
561a61cb48dSMatt Arsenault if (Val == OtherOp)
562a61cb48dSMatt Arsenault OtherOp = Inst->getOperand(OpIdx1);
563a61cb48dSMatt Arsenault
564891fccc0SMatt Arsenault if (isa<ConstantPointerNull>(OtherOp))
565891fccc0SMatt Arsenault return true;
566891fccc0SMatt Arsenault
567b0eb40caSVitaly Buka Value *OtherObj = getUnderlyingObject(OtherOp);
568a61cb48dSMatt Arsenault if (!isa<AllocaInst>(OtherObj))
569a61cb48dSMatt Arsenault return false;
570a61cb48dSMatt Arsenault
571a61cb48dSMatt Arsenault // TODO: We should be able to replace undefs with the right pointer type.
572a61cb48dSMatt Arsenault
573a61cb48dSMatt Arsenault // TODO: If we know the other base object is another promotable
574a61cb48dSMatt Arsenault // alloca, not necessarily this alloca, we can do this. The
575a61cb48dSMatt Arsenault // important part is both must have the same address space at
576a61cb48dSMatt Arsenault // the end.
577a61cb48dSMatt Arsenault if (OtherObj != BaseAlloca) {
578d34e60caSNicola Zaghen LLVM_DEBUG(
579d34e60caSNicola Zaghen dbgs() << "Found a binary instruction with another alloca object\n");
580a61cb48dSMatt Arsenault return false;
581a61cb48dSMatt Arsenault }
582a61cb48dSMatt Arsenault
583a61cb48dSMatt Arsenault return true;
584a61cb48dSMatt Arsenault }
585a61cb48dSMatt Arsenault
/// Recursively collect all transitive pointer-typed users of \p Val
/// (originally \p BaseAlloca) into \p WorkList, returning false if any use
/// cannot be rewritten to a new address space (volatile accesses, ptrtoint,
/// captured addrspacecasts, non-inbounds GEPs, non-promotable calls, or
/// pointer operands mixing in values derived from a different object).
bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {

  for (User *User : Val->users()) {
    // Already queued via another use chain; don't process twice.
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      // Only a fixed set of intrinsics is safe; any other call could
      // capture/escape the pointer.
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    // Once the address leaks into an integer we can't track it.
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      // Loads need no rewrite themselves; the pointer operand will be fixed.
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote an icmp if we know that the other compared operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    // Do not promote vector/aggregate type instructions. It is hard to track
    // their users.
    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
      return false;

    // Non-pointer results (e.g. the i1 of an icmp) have no further pointer
    // users to chase.
    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    // The pointer-typed user itself must be rewritten, and its own users
    // walked recursively.
    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}
69145bb48eaSTom Stellard
/// Decide whether \p F can use LDS for promoted allocas, and initialize the
/// pass state: LocalMemLimit becomes the byte budget that does not reduce the
/// estimated occupancy, and CurrentLocalMemUsage becomes the worst-case LDS
/// already consumed by global variables referenced from \p F. Returns false
/// when promotion to LDS must be disabled (setting LocalMemLimit = 0 for the
/// hard-disable cases).
bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  // Worklist/visited sets for walking ConstantExpr users of LDS globals.
  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  // Returns true if some user of Val is an instruction inside F; constant
  // users are queued on Stack for the caller to traverse further.
  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  // Find every LDS global actually referenced from F.
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more optimal
  // value instead
  llvm::sort(AllocatedSizes, llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
                                                          F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors..
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);


  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << " Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}
8341dbace19SChangpeng Fang
8358a028bf4SMatt Arsenault // FIXME: Should try to pick the most likely to be profitable allocas first.
/// Try to promote a single static, non-array alloca: first into a vector,
/// and failing that into a per-workitem slice of an LDS array (kernel calling
/// conventions only, and only if \p SufficientLDS / the LDS budget allow it).
/// On LDS promotion the alloca is erased and all collected uses are rewritten
/// to the LOCAL_ADDRESS address space. Returns true if the alloca was
/// promoted either way.
bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
    return false;

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  // Use the maximum flat workgroup size: one LDS element per possible
  // workitem.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << " " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value*> WorkList;

  // Bail before any IR mutation if some use cannot be rewritten.
  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  // Create an LDS array with one element per workitem in the workgroup.
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlign());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  // Linearize the workitem id: TID = TIdX*TCntY*TCntZ + TIdY*TCntZ + TIdZ.
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  // Replace the alloca with this workitem's element of the LDS array.
  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  SmallVector<IntrinsicInst *> DeferredIntrs;

  // Rewrite every collected use to the new address space.
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        // An icmp keeps its i1 type; only null-pointer operands need to be
        // recreated in the new address space.
        Value *Src0 = CI->getOperand(0);
        PointerType *NewTy = PointerType::getWithSamePointeeType(
            cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      PointerType *NewTy = PointerType::getWithSamePointeeType(
          cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    // Calls in the worklist are guaranteed intrinsics by isCallPromotable.
    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case if second pointer also needs
      // to be replaced we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      // Recreate the memset so it is declared for the new address space.
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(
          MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
          MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      // Re-declare objectsize with the new pointer type and replace the call.
      Value *Src = Intr->getOperand(0);
      Function *ObjectSize = Intrinsic::getDeclaration(
          Mod, Intrinsic::objectsize,
          {Intr->getType(),
           PointerType::getWithSamePointeeType(
               cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  // Now both pointer operands of the deferred memcpy/memmove calls have their
  // final types; recreate each transfer and carry over dereferenceable info.
  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B =
        Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
                                      MI->getRawSource(), MI->getSourceAlign(),
                                      MI->getLength(), MI->isVolatile());

    for (unsigned I = 0; I != 2; ++I) {
      if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
        B->addDereferenceableParamAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}
106945bb48eaSTom Stellard
handlePromoteAllocaToVector(AllocaInst & I,unsigned MaxVGPRs)10700e9abcfcSArthur Eubanks bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
10710e9abcfcSArthur Eubanks // Array allocations are probably not worth handling, since an allocation of
10720e9abcfcSArthur Eubanks // the array type is the canonical form.
10730e9abcfcSArthur Eubanks if (!I.isStaticAlloca() || I.isArrayAllocation())
1074689e616eSStanislav Mekhanoshin return false;
1075689e616eSStanislav Mekhanoshin
10760e9abcfcSArthur Eubanks LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
10770e9abcfcSArthur Eubanks
10780e9abcfcSArthur Eubanks Module *Mod = I.getParent()->getParent()->getParent();
10790e9abcfcSArthur Eubanks return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
10800e9abcfcSArthur Eubanks }
10810e9abcfcSArthur Eubanks
promoteAllocasToVector(Function & F,TargetMachine & TM)10820e9abcfcSArthur Eubanks bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
10830e9abcfcSArthur Eubanks if (DisablePromoteAllocaToVector)
108454e2dc75SStanislav Mekhanoshin return false;
108554e2dc75SStanislav Mekhanoshin
10860e9abcfcSArthur Eubanks const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
108754e2dc75SStanislav Mekhanoshin if (!ST.isPromoteAllocaEnabled())
108854e2dc75SStanislav Mekhanoshin return false;
108954e2dc75SStanislav Mekhanoshin
10900e9abcfcSArthur Eubanks unsigned MaxVGPRs;
10910e9abcfcSArthur Eubanks if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
10920e9abcfcSArthur Eubanks const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
109354e2dc75SStanislav Mekhanoshin MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
1094cf74ef13SStanislav Mekhanoshin // A non-entry function has only 32 caller preserved registers.
1095cf74ef13SStanislav Mekhanoshin // Do not promote alloca which will force spilling.
1096cf74ef13SStanislav Mekhanoshin if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
1097cf74ef13SStanislav Mekhanoshin MaxVGPRs = std::min(MaxVGPRs, 32u);
109854e2dc75SStanislav Mekhanoshin } else {
109954e2dc75SStanislav Mekhanoshin MaxVGPRs = 128;
110054e2dc75SStanislav Mekhanoshin }
110154e2dc75SStanislav Mekhanoshin
1102689e616eSStanislav Mekhanoshin bool Changed = false;
1103689e616eSStanislav Mekhanoshin BasicBlock &EntryBB = *F.begin();
1104689e616eSStanislav Mekhanoshin
1105689e616eSStanislav Mekhanoshin SmallVector<AllocaInst *, 16> Allocas;
1106689e616eSStanislav Mekhanoshin for (Instruction &I : EntryBB) {
1107689e616eSStanislav Mekhanoshin if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
1108689e616eSStanislav Mekhanoshin Allocas.push_back(AI);
1109689e616eSStanislav Mekhanoshin }
1110689e616eSStanislav Mekhanoshin
1111689e616eSStanislav Mekhanoshin for (AllocaInst *AI : Allocas) {
11120e9abcfcSArthur Eubanks if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
1113689e616eSStanislav Mekhanoshin Changed = true;
1114689e616eSStanislav Mekhanoshin }
1115689e616eSStanislav Mekhanoshin
1116689e616eSStanislav Mekhanoshin return Changed;
1117689e616eSStanislav Mekhanoshin }
1118689e616eSStanislav Mekhanoshin
runOnFunction(Function & F)11190e9abcfcSArthur Eubanks bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
11200e9abcfcSArthur Eubanks if (skipFunction(F))
1121689e616eSStanislav Mekhanoshin return false;
11220e9abcfcSArthur Eubanks if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
11230e9abcfcSArthur Eubanks return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
11240e9abcfcSArthur Eubanks }
11250e9abcfcSArthur Eubanks return false;
11260e9abcfcSArthur Eubanks }
1127689e616eSStanislav Mekhanoshin
11280e9abcfcSArthur Eubanks PreservedAnalyses
run(Function & F,FunctionAnalysisManager & AM)11290e9abcfcSArthur Eubanks AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
11300e9abcfcSArthur Eubanks bool Changed = promoteAllocasToVector(F, TM);
11310e9abcfcSArthur Eubanks if (Changed) {
11320e9abcfcSArthur Eubanks PreservedAnalyses PA;
11330e9abcfcSArthur Eubanks PA.preserveSet<CFGAnalyses>();
11340e9abcfcSArthur Eubanks return PA;
11350e9abcfcSArthur Eubanks }
11360e9abcfcSArthur Eubanks return PreservedAnalyses::all();
1137689e616eSStanislav Mekhanoshin }
1138689e616eSStanislav Mekhanoshin
// Factory for the legacy-PM combined promote-alloca pass. The returned
// pointer is owned by the caller (per legacy pass-manager convention, the
// PassManager it is added to takes ownership).
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}
1142689e616eSStanislav Mekhanoshin
// Factory for the legacy-PM vector-only promote-alloca pass. The returned
// pointer is owned by the caller (per legacy pass-manager convention, the
// PassManager it is added to takes ownership).
FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
  return new AMDGPUPromoteAllocaToVector();
}
1146