1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to the local address space (LDS).
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "Utils/AMDGPUBaseInfo.h"
17 #include "llvm/ADT/APInt.h"
18 #include "llvm/ADT/None.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Triple.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/Analysis/CaptureTracking.h"
24 #include "llvm/Analysis/ValueTracking.h"
25 #include "llvm/CodeGen/TargetPassConfig.h"
26 #include "llvm/IR/Attributes.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/Constant.h"
29 #include "llvm/IR/Constants.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DerivedTypes.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/GlobalValue.h"
34 #include "llvm/IR/GlobalVariable.h"
35 #include "llvm/IR/IRBuilder.h"
36 #include "llvm/IR/Instruction.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/IntrinsicInst.h"
39 #include "llvm/IR/Intrinsics.h"
40 #include "llvm/IR/IntrinsicsAMDGPU.h"
41 #include "llvm/IR/IntrinsicsR600.h"
42 #include "llvm/IR/LLVMContext.h"
43 #include "llvm/IR/Metadata.h"
44 #include "llvm/IR/Module.h"
45 #include "llvm/IR/Type.h"
46 #include "llvm/IR/User.h"
47 #include "llvm/IR/Value.h"
48 #include "llvm/Pass.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/Debug.h"
51 #include "llvm/Support/ErrorHandling.h"
52 #include "llvm/Support/MathExtras.h"
53 #include "llvm/Support/raw_ostream.h"
54 #include "llvm/Target/TargetMachine.h"
55 #include <algorithm>
56 #include <cassert>
57 #include <cstdint>
58 #include <map>
59 #include <tuple>
60 #include <utility>
61 #include <vector>
62 
63 #define DEBUG_TYPE "amdgpu-promote-alloca"
64 
65 using namespace llvm;
66 
67 namespace {
68 
69 static cl::opt<bool> DisablePromoteAllocaToVector(
70   "disable-promote-alloca-to-vector",
71   cl::desc("Disable promote alloca to vector"),
72   cl::init(false));
73 
74 static cl::opt<bool> DisablePromoteAllocaToLDS(
75   "disable-promote-alloca-to-lds",
76   cl::desc("Disable promote alloca to LDS"),
77   cl::init(false));
78 
79 // FIXME: This can create globals so should be a module pass.
80 class AMDGPUPromoteAlloca : public FunctionPass {
81 private:
82   const TargetMachine *TM;
83   Module *Mod = nullptr;
84   const DataLayout *DL = nullptr;
85 
86   // FIXME: This should be per-kernel.
87   uint32_t LocalMemLimit = 0;
88   uint32_t CurrentLocalMemUsage = 0;
89 
90   bool IsAMDGCN = false;
91   bool IsAMDHSA = false;
92 
93   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
94   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
95 
96   /// BaseAlloca is the alloca root the search started from.
97   /// Val may be that alloca or a recursive user of it.
98   bool collectUsesWithPtrTypes(Value *BaseAlloca,
99                                Value *Val,
100                                std::vector<Value*> &WorkList) const;
101 
102   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
103   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
104   /// Returns true if both operands are derived from the same alloca. Val should
105   /// be the same value as one of the input operands of UseInst.
106   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
107                                        Instruction *UseInst,
108                                        int OpIdx0, int OpIdx1) const;
109 
110   /// Check whether we have enough local memory for promotion.
111   bool hasSufficientLocalMem(const Function &F);
112 
113 public:
114   static char ID;
115 
116   AMDGPUPromoteAlloca() : FunctionPass(ID) {}
117 
118   bool doInitialization(Module &M) override;
119   bool runOnFunction(Function &F) override;
120 
121   StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
122 
123   bool handleAlloca(AllocaInst &I, bool SufficientLDS);
124 
125   void getAnalysisUsage(AnalysisUsage &AU) const override {
126     AU.setPreservesCFG();
127     FunctionPass::getAnalysisUsage(AU);
128   }
129 };
130 
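// A reduced variant of AMDGPUPromoteAlloca that only performs the
// alloca-to-vector transformation and never promotes allocas to LDS.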
131 class AMDGPUPromoteAllocaToVector : public FunctionPass {
132 public:
133   static char ID;
134 
135   AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
136 
137   bool runOnFunction(Function &F) override;
138 
139   StringRef getPassName() const override {
140     return "AMDGPU Promote Alloca to vector";
141   }
142 
143   bool handleAlloca(AllocaInst &I);
144 
145   void getAnalysisUsage(AnalysisUsage &AU) const override {
146     AU.setPreservesCFG();
147     FunctionPass::getAnalysisUsage(AU);
148   }
149 };
150 
151 } // end anonymous namespace
152 
153 char AMDGPUPromoteAlloca::ID = 0;
154 char AMDGPUPromoteAllocaToVector::ID = 0;
155 
156 INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
157                 "AMDGPU promote alloca to vector or LDS", false, false)
158 
159 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
160                 "AMDGPU promote alloca to vector", false, false)
161 
162 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
163 char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
164 
165 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
166   Mod = &M;
167   DL = &Mod->getDataLayout();
168 
169   return false;
170 }
171 
172 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
173   if (skipFunction(F))
174     return false;
175 
176   if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
177     TM = &TPC->getTM<TargetMachine>();
178   else
179     return false;
180 
181   const Triple &TT = TM->getTargetTriple();
182   IsAMDGCN = TT.getArch() == Triple::amdgcn;
183   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
184 
185   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
186   if (!ST.isPromoteAllocaEnabled())
187     return false;
188 
189   bool SufficientLDS = hasSufficientLocalMem(F);
190   bool Changed = false;
191   BasicBlock &EntryBB = *F.begin();
192 
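  // Collect the entry-block allocas up front; handleAlloca inserts new
  // instructions and erases the promoted allocas, so we must not iterate the
  // block while rewriting it.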
193   SmallVector<AllocaInst *, 16> Allocas;
194   for (Instruction &I : EntryBB) {
195     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
196       Allocas.push_back(AI);
197   }
198 
199   for (AllocaInst *AI : Allocas) {
200     if (handleAlloca(*AI, SufficientLDS))
201       Changed = true;
202   }
203 
204   return Changed;
205 }
206 
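// Returns the workgroup size in the Y and Z dimensions. Outside of AMDHSA
// this uses the r600 local-size intrinsics; on AMDHSA the values are loaded
// from the workgroup_size_* fields of the kernel dispatch packet.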
207 std::pair<Value *, Value *>
208 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
209   const Function &F = *Builder.GetInsertBlock()->getParent();
210   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
211 
212   if (!IsAMDHSA) {
213     Function *LocalSizeYFn
214       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
215     Function *LocalSizeZFn
216       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
217 
218     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
219     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
220 
221     ST.makeLIDRangeMetadata(LocalSizeY);
222     ST.makeLIDRangeMetadata(LocalSizeZ);
223 
224     return std::make_pair(LocalSizeY, LocalSizeZ);
225   }
226 
227   // We must read the size out of the dispatch pointer.
228   assert(IsAMDGCN);
229 
230   // We are indexing into this struct, and want to extract the workgroup_size_*
231   // fields.
232   //
233   //   typedef struct hsa_kernel_dispatch_packet_s {
234   //     uint16_t header;
235   //     uint16_t setup;
236   //     uint16_t workgroup_size_x ;
237   //     uint16_t workgroup_size_y;
238   //     uint16_t workgroup_size_z;
239   //     uint16_t reserved0;
240   //     uint32_t grid_size_x ;
241   //     uint32_t grid_size_y ;
242   //     uint32_t grid_size_z;
243   //
244   //     uint32_t private_segment_size;
245   //     uint32_t group_segment_size;
246   //     uint64_t kernel_object;
247   //
248   // #ifdef HSA_LARGE_MODEL
249   //     void *kernarg_address;
250   // #elif defined HSA_LITTLE_ENDIAN
251   //     void *kernarg_address;
252   //     uint32_t reserved1;
253   // #else
254   //     uint32_t reserved1;
255   //     void *kernarg_address;
256   // #endif
257   //     uint64_t reserved2;
258   //     hsa_signal_t completion_signal; // uint64_t wrapper
259   //   } hsa_kernel_dispatch_packet_t
260   //
261   Function *DispatchPtrFn
262     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
263 
264   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
265   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
266   DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
267 
268   // Size of the dispatch packet struct.
269   DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
270 
271   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
272   Value *CastDispatchPtr = Builder.CreateBitCast(
273     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
274 
  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit load and extract sequence is already present, and it is probably
  // easier to CSE this. The loads should be mergeable later anyway.
278   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
279   LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
280 
281   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
282   LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
283 
284   MDNode *MD = MDNode::get(Mod->getContext(), None);
285   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
286   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
287   ST.makeLIDRangeMetadata(LoadZU);
288 
289   // Extract y component. Upper half of LoadZU should be zero already.
290   Value *Y = Builder.CreateLShr(LoadXY, 16);
291 
292   return std::make_pair(Y, LoadZU);
293 }
294 
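// Returns the workitem ID for dimension N (0 = x, 1 = y, 2 = z) via the
// appropriate target intrinsic, annotated with local-ID range metadata for
// the subtarget.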
295 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
296   const AMDGPUSubtarget &ST =
297       AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent());
298   Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
299 
300   switch (N) {
301   case 0:
302     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
303                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
304     break;
305   case 1:
306     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
307                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
308     break;
309 
310   case 2:
311     IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
312                       : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
313     break;
314   default:
315     llvm_unreachable("invalid dimension");
316   }
317 
318   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
319   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
320   ST.makeLIDRangeMetadata(CI);
321 
322   return CI;
323 }
324 
325 static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
326   return FixedVectorType::get(ArrayTy->getElementType(),
327                               ArrayTy->getNumElements());
328 }
329 
330 static Value *stripBitcasts(Value *V) {
331   while (Instruction *I = dyn_cast<Instruction>(V)) {
332     if (I->getOpcode() != Instruction::BitCast)
333       break;
334     V = I->getOperand(0);
335   }
336   return V;
337 }
338 
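// Looks up the vector index previously recorded in GEPIdx for the GEP that
// produces Ptr (looking through bitcasts); returns nullptr if none was
// recorded.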
339 static Value *
340 calculateVectorIndex(Value *Ptr,
341                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
342   GetElementPtrInst *GEP = cast<GetElementPtrInst>(stripBitcasts(Ptr));
343 
344   auto I = GEPIdx.find(GEP);
345   return I == GEPIdx.end() ? nullptr : I->second;
346 }
347 
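// Returns the single variable index of a simple two-index GEP whose first
// index is zero, or nullptr otherwise. Roughly (a schematic sketch, not
// literal IR):
//
//   %p = getelementptr [N x T], [N x T] addrspace(5)* %alloca, i32 0, i32 %i
//
// yields %i.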
static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases.
350   if (GEP->getNumOperands() != 3)
351     return nullptr;
352 
353   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
354   if (!I0 || !I0->isZero())
355     return nullptr;
356 
357   return GEP->getOperand(2);
358 }
359 
// Checks whether Inst is an instruction we know how to rewrite when the
// containing alloca is turned into a vector (see tryPromoteAllocaToVector).
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
364 static bool canVectorizeInst(Instruction *Inst, User *User,
365                              const DataLayout &DL) {
366   switch (Inst->getOpcode()) {
367   case Instruction::Load: {
    // Currently we only handle the case where the pointer operand is a GEP
    // or a bitcast, and we cannot vectorize volatile or atomic loads.
370     LoadInst *LI = cast<LoadInst>(Inst);
371     if (isa<AllocaInst>(User) &&
372         LI->getPointerOperandType() == User->getType() &&
373         isa<VectorType>(LI->getType()))
374       return true;
375 
376     Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
377     if (!PtrInst)
378       return false;
379 
380     return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
381             PtrInst->getOpcode() == Instruction::BitCast) &&
382            LI->isSimple();
383   }
384   case Instruction::BitCast:
385     return true;
386   case Instruction::Store: {
    // The alloca-derived pointer must be the store's pointer operand, not the
    // stored value, and since the IR should be in canonical form the user is
    // expected to be a GEP or a bitcast. We also cannot vectorize volatile or
    // atomic stores.
390     StoreInst *SI = cast<StoreInst>(Inst);
391     if (isa<AllocaInst>(User) &&
392         SI->getPointerOperandType() == User->getType() &&
393         isa<VectorType>(SI->getValueOperand()->getType()))
394       return true;
395 
396     Instruction *UserInst = dyn_cast<Instruction>(User);
397     if (!UserInst)
398       return false;
399 
400     return (SI->getPointerOperand() == User) &&
401            (UserInst->getOpcode() == Instruction::GetElementPtr ||
402             UserInst->getOpcode() == Instruction::BitCast) &&
403            SI->isSimple();
404   }
405   default:
406     return false;
407   }
408 }
409 
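// Attempts to rewrite a small, statically indexed private array (or vector)
// alloca so that each access becomes a whole-vector load/store plus an
// extract/insertelement at the index recovered from the corresponding GEP.
// Roughly (an illustrative sketch; the actual types come from the alloca):
//
//   %gep = getelementptr [4 x i32], [4 x i32] addrspace(5)* %a, i32 0, i32 %i
//   store i32 %v, i32 addrspace(5)* %gep
//
// becomes a load of the full <4 x i32> from the bitcast alloca, an
// insertelement of %v at index %i, and a store of the updated vector.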
410 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
412   if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion of alloca to vector is disabled\n");
414     return false;
415   }
416 
417   Type *AllocaTy = Alloca->getAllocatedType();
418   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
419   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
420     if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
421         ArrayTy->getNumElements() > 0)
422       VectorTy = arrayTypeToVecType(ArrayTy);
423   }
424 
425   LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
426 
  // FIXME: There is no reason why we can't support larger arrays; we are just
  // being conservative for now.
  // FIXME: We also reject allocas of the form [2 x [2 x i32]] or equivalent.
  // Potentially these could also be promoted, but we don't currently handle
  // this case.
431   if (!VectorTy || VectorTy->getNumElements() > 16 ||
432       VectorTy->getNumElements() < 2) {
433     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
434     return false;
435   }
436 
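  // Walk all transitive users of the alloca: record the vector index for each
  // GEP, look through bitcasts between same-sized non-aggregate types, and
  // queue the remaining vectorizable instructions (loads and stores) for
  // rewriting below.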
437   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
438   std::vector<Value *> WorkList;
439   SmallVector<User *, 8> Users(Alloca->users());
440   SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
441   Type *VecEltTy = VectorTy->getElementType();
442   while (!Users.empty()) {
443     User *AllocaUser = Users.pop_back_val();
444     User *UseUser = UseUsers.pop_back_val();
445     Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
446 
447     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
448     if (!GEP) {
449       if (!canVectorizeInst(Inst, UseUser, DL))
450         return false;
451 
452       if (Inst->getOpcode() == Instruction::BitCast) {
453         Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
454         Type *ToTy = Inst->getType()->getPointerElementType();
455         if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
456             DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
457           continue;
458 
459         for (User *CastUser : Inst->users()) {
460           if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
461             continue;
462           Users.push_back(CastUser);
463           UseUsers.push_back(Inst);
464         }
465 
466         continue;
467       }
468 
469       WorkList.push_back(AllocaUser);
470       continue;
471     }
472 
473     Value *Index = GEPToVectorIndex(GEP);
474 
475     // If we can't compute a vector index from this GEP, then we can't
476     // promote this alloca to vector.
477     if (!Index) {
478       LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
479                         << '\n');
480       return false;
481     }
482 
483     GEPVectorIdx[GEP] = Index;
484     Users.append(GEP->user_begin(), GEP->user_end());
485     UseUsers.append(GEP->getNumUses(), GEP);
486   }
487 
488   LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
489                     << *VectorTy << '\n');
490 
491   for (Value *V : WorkList) {
492     Instruction *Inst = cast<Instruction>(V);
493     IRBuilder<> Builder(Inst);
494     switch (Inst->getOpcode()) {
495     case Instruction::Load: {
496       if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
497         break;
498 
499       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
500       Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
501       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
502 
503       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
504       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
505       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
506       if (Inst->getType() != VecEltTy)
507         ExtractElement = Builder.CreateBitCast(ExtractElement, Inst->getType());
508       Inst->replaceAllUsesWith(ExtractElement);
509       Inst->eraseFromParent();
510       break;
511     }
512     case Instruction::Store: {
513       StoreInst *SI = cast<StoreInst>(Inst);
514       if (SI->getValueOperand()->getType() == AllocaTy ||
515           SI->getValueOperand()->getType()->isVectorTy())
516         break;
517 
518       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
519       Value *Ptr = SI->getPointerOperand();
520       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
521       Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
522       Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
523       Value *Elt = SI->getValueOperand();
524       if (Elt->getType() != VecEltTy)
525         Elt = Builder.CreateBitCast(Elt, VecEltTy);
526       Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
527       Builder.CreateStore(NewVecValue, BitCast);
528       Inst->eraseFromParent();
529       break;
530     }
531 
532     default:
533       llvm_unreachable("Inconsistency in instructions promotable to vector");
534     }
535   }
536   return true;
537 }
538 
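// Returns true for intrinsic calls that handleAlloca knows how to rewrite or
// erase once the pointed-to memory has been moved to the LDS address space.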
539 static bool isCallPromotable(CallInst *CI) {
540   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
541   if (!II)
542     return false;
543 
544   switch (II->getIntrinsicID()) {
545   case Intrinsic::memcpy:
546   case Intrinsic::memmove:
547   case Intrinsic::memset:
548   case Intrinsic::lifetime_start:
549   case Intrinsic::lifetime_end:
550   case Intrinsic::invariant_start:
551   case Intrinsic::invariant_end:
552   case Intrinsic::launder_invariant_group:
553   case Intrinsic::strip_invariant_group:
554   case Intrinsic::objectsize:
555     return true;
556   default:
557     return false;
558   }
559 }
560 
561 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
562                                                           Value *Val,
563                                                           Instruction *Inst,
564                                                           int OpIdx0,
565                                                           int OpIdx1) const {
566   // Figure out which operand is the one we might not be promoting.
567   Value *OtherOp = Inst->getOperand(OpIdx0);
568   if (Val == OtherOp)
569     OtherOp = Inst->getOperand(OpIdx1);
570 
571   if (isa<ConstantPointerNull>(OtherOp))
572     return true;
573 
574   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
575   if (!isa<AllocaInst>(OtherObj))
576     return false;
577 
578   // TODO: We should be able to replace undefs with the right pointer type.
579 
580   // TODO: If we know the other base object is another promotable
581   // alloca, not necessarily this alloca, we can do this. The
582   // important part is both must have the same address space at
583   // the end.
584   if (OtherObj != BaseAlloca) {
585     LLVM_DEBUG(
586         dbgs() << "Found a binary instruction with another alloca object\n");
587     return false;
588   }
589 
590   return true;
591 }
592 
593 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
594   Value *BaseAlloca,
595   Value *Val,
596   std::vector<Value*> &WorkList) const {
597 
598   for (User *User : Val->users()) {
599     if (is_contained(WorkList, User))
600       continue;
601 
602     if (CallInst *CI = dyn_cast<CallInst>(User)) {
603       if (!isCallPromotable(CI))
604         return false;
605 
606       WorkList.push_back(User);
607       continue;
608     }
609 
610     Instruction *UseInst = cast<Instruction>(User);
611     if (UseInst->getOpcode() == Instruction::PtrToInt)
612       return false;
613 
614     if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
615       if (LI->isVolatile())
616         return false;
617 
618       continue;
619     }
620 
621     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
622       if (SI->isVolatile())
623         return false;
624 
      // Reject if the alloca-derived pointer is the stored value rather than
      // the store's pointer operand.
626       if (SI->getPointerOperand() != Val)
627         return false;
628     } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
629       if (RMW->isVolatile())
630         return false;
631     } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
632       if (CAS->isVolatile())
633         return false;
634     }
635 
    // Only promote an icmp if we know that the other compared operand is a
    // null pointer or is derived from the same alloca being promoted.
638     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
639       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
640         return false;
641 
642       // May need to rewrite constant operands.
643       WorkList.push_back(ICmp);
644     }
645 
646     if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
647       // Give up if the pointer may be captured.
648       if (PointerMayBeCaptured(UseInst, true, true))
649         return false;
650       // Don't collect the users of this.
651       WorkList.push_back(User);
652       continue;
653     }
654 
655     if (!User->getType()->isPointerTy())
656       continue;
657 
658     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
659       // Be conservative if an address could be computed outside the bounds of
660       // the alloca.
661       if (!GEP->isInBounds())
662         return false;
663     }
664 
665     // Only promote a select if we know that the other select operand is from
666     // another pointer that will also be promoted.
667     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
668       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
669         return false;
670     }
671 
672     // Repeat for phis.
673     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
674       // TODO: Handle more complex cases. We should be able to replace loops
675       // over arrays.
676       switch (Phi->getNumIncomingValues()) {
677       case 1:
678         break;
679       case 2:
680         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
681           return false;
682         break;
683       default:
684         return false;
685       }
686     }
687 
688     WorkList.push_back(User);
689     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
690       return false;
691   }
692 
693   return true;
694 }
695 
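// Estimates how much LDS is already consumed by local-address-space globals
// used from F, then derives the budget left for promoted allocas
// (LocalMemLimit and CurrentLocalMemUsage) without unduly reducing occupancy.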
696 bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
698   FunctionType *FTy = F.getFunctionType();
699   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
700 
701   // If the function has any arguments in the local address space, then it's
702   // possible these arguments require the entire local memory space, so
703   // we cannot use local memory in the pass.
704   for (Type *ParamTy : FTy->params()) {
705     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
706     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
707       LocalMemLimit = 0;
708       LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
709                            "local memory disabled.\n");
710       return false;
711     }
712   }
713 
714   LocalMemLimit = ST.getLocalMemorySize();
715   if (LocalMemLimit == 0)
716     return false;
717 
718   const DataLayout &DL = Mod->getDataLayout();
719 
720   // Check how much local memory is being used by global objects
721   CurrentLocalMemUsage = 0;
722   for (GlobalVariable &GV : Mod->globals()) {
723     if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
724       continue;
725 
726     for (const User *U : GV.users()) {
727       const Instruction *Use = dyn_cast<Instruction>(U);
728       if (!Use)
729         continue;
730 
731       if (Use->getParent()->getParent() == &F) {
732         unsigned Align = GV.getAlignment();
733         if (Align == 0)
734           Align = DL.getABITypeAlignment(GV.getValueType());
735 
736         // FIXME: Try to account for padding here. The padding is currently
737         // determined from the inverse order of uses in the function. I'm not
738         // sure if the use list order is in any way connected to this, so the
739         // total reported size is likely incorrect.
740         uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
741         CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
742         CurrentLocalMemUsage += AllocSize;
743         break;
744       }
745     }
746   }
747 
748   unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
749                                                           F);
750 
751   // Restrict local memory usage so that we don't drastically reduce occupancy,
752   // unless it is already significantly reduced.
753 
  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
756   unsigned OccupancyHint = ST.getWavesPerEU(F).second;
757   if (OccupancyHint == 0)
758     OccupancyHint = 7;
759 
760   // Clamp to max value.
761   OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
762 
763   // Check the hint but ignore it if it's obviously wrong from the existing LDS
764   // usage.
765   MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
766 
768   // Round up to the next tier of usage.
769   unsigned MaxSizeWithWaveCount
770     = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
771 
772   // Program is possibly broken by using more local mem than available.
773   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
774     return false;
775 
776   LocalMemLimit = MaxSizeWithWaveCount;
777 
778   LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
779                     << " bytes of LDS\n"
780                     << "  Rounding size to " << MaxSizeWithWaveCount
781                     << " with a maximum occupancy of " << MaxOccupancy << '\n'
782                     << " and " << (LocalMemLimit - CurrentLocalMemUsage)
783                     << " available for promotion\n");
784 
785   return true;
786 }
787 
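// Tries to promote a single static alloca (no dynamic array allocation):
// first to a vector, then, for kernel calling conventions with sufficient
// free LDS, to a per-workgroup LDS array indexed by the linearized workitem
// ID.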
788 // FIXME: Should try to pick the most likely to be profitable allocas first.
789 bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
790   // Array allocations are probably not worth handling, since an allocation of
791   // the array type is the canonical form.
792   if (!I.isStaticAlloca() || I.isArrayAllocation())
793     return false;
794 
795   const DataLayout &DL = Mod->getDataLayout();
796   IRBuilder<> Builder(&I);
797 
798   // First try to replace the alloca with a vector
799   Type *AllocaTy = I.getAllocatedType();
800 
801   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
802 
803   if (tryPromoteAllocaToVector(&I, DL))
804     return true; // Promoted to vector.
805 
806   if (DisablePromoteAllocaToLDS)
807     return false;
808 
809   const Function &ContainingFunction = *I.getParent()->getParent();
810   CallingConv::ID CC = ContainingFunction.getCallingConv();
811 
812   // Don't promote the alloca to LDS for shader calling conventions as the work
813   // item ID intrinsics are not supported for these calling conventions.
  // Furthermore, not all LDS is available for some of the stages.
815   switch (CC) {
816   case CallingConv::AMDGPU_KERNEL:
817   case CallingConv::SPIR_KERNEL:
818     break;
819   default:
820     LLVM_DEBUG(
821         dbgs()
822         << " promote alloca to LDS not supported with calling convention.\n");
823     return false;
824   }
825 
826   // Not likely to have sufficient local memory for promotion.
827   if (!SufficientLDS)
828     return false;
829 
830   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
831   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
832 
833   unsigned Align = I.getAlignment();
834   if (Align == 0)
835     Align = DL.getABITypeAlignment(I.getAllocatedType());
836 
837   // FIXME: This computed padding is likely wrong since it depends on inverse
838   // usage order.
839   //
  // FIXME: It is also possible that if we're allowed to use all of the
  // memory, we could end up using more than the maximum due to alignment
  // padding.
842 
843   uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
844   uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
845   NewSize += AllocSize;
846 
847   if (NewSize > LocalMemLimit) {
848     LLVM_DEBUG(dbgs() << "  " << AllocSize
849                       << " bytes of local memory not available to promote\n");
850     return false;
851   }
852 
853   CurrentLocalMemUsage = NewSize;
854 
855   std::vector<Value*> WorkList;
856 
857   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
858     LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
859     return false;
860   }
861 
862   LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
863 
864   Function *F = I.getParent()->getParent();
865 
866   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
867   GlobalVariable *GV = new GlobalVariable(
868       *Mod, GVTy, false, GlobalValue::InternalLinkage,
869       UndefValue::get(GVTy),
870       Twine(F->getName()) + Twine('.') + I.getName(),
871       nullptr,
872       GlobalVariable::NotThreadLocal,
873       AMDGPUAS::LOCAL_ADDRESS);
874   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
875   GV->setAlignment(MaybeAlign(I.getAlignment()));
876 
877   Value *TCntY, *TCntZ;
878 
879   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
880   Value *TIdX = getWorkitemID(Builder, 0);
881   Value *TIdY = getWorkitemID(Builder, 1);
882   Value *TIdZ = getWorkitemID(Builder, 2);
883 
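  // Linearize the 3D workitem ID so that each workitem gets a distinct slot
  // in the per-workgroup array: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ
  // + TIdZ.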
884   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
885   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
886   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
887   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
888   TID = Builder.CreateAdd(TID, TIdZ);
889 
890   Value *Indices[] = {
891     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
892     TID
893   };
894 
895   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
896   I.mutateType(Offset->getType());
897   I.replaceAllUsesWith(Offset);
898   I.eraseFromParent();
899 
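  // Fix up the collected pointer users: plain instructions and their constant
  // operands are retyped to the LDS address space, while the handled
  // intrinsics are re-created with LDS pointer arguments or simply erased.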
900   for (Value *V : WorkList) {
901     CallInst *Call = dyn_cast<CallInst>(V);
902     if (!Call) {
903       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
904         Value *Src0 = CI->getOperand(0);
905         Type *EltTy = Src0->getType()->getPointerElementType();
906         PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
907 
908         if (isa<ConstantPointerNull>(CI->getOperand(0)))
909           CI->setOperand(0, ConstantPointerNull::get(NewTy));
910 
911         if (isa<ConstantPointerNull>(CI->getOperand(1)))
912           CI->setOperand(1, ConstantPointerNull::get(NewTy));
913 
914         continue;
915       }
916 
917       // The operand's value should be corrected on its own and we don't want to
918       // touch the users.
919       if (isa<AddrSpaceCastInst>(V))
920         continue;
921 
922       Type *EltTy = V->getType()->getPointerElementType();
923       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
924 
925       // FIXME: It doesn't really make sense to try to do this for all
926       // instructions.
927       V->mutateType(NewTy);
928 
929       // Adjust the types of any constant operands.
930       if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
931         if (isa<ConstantPointerNull>(SI->getOperand(1)))
932           SI->setOperand(1, ConstantPointerNull::get(NewTy));
933 
934         if (isa<ConstantPointerNull>(SI->getOperand(2)))
935           SI->setOperand(2, ConstantPointerNull::get(NewTy));
936       } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
937         for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
938           if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
939             Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
940         }
941       }
942 
943       continue;
944     }
945 
946     IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
947     Builder.SetInsertPoint(Intr);
948     switch (Intr->getIntrinsicID()) {
949     case Intrinsic::lifetime_start:
950     case Intrinsic::lifetime_end:
951       // These intrinsics are for address space 0 only
952       Intr->eraseFromParent();
953       continue;
954     case Intrinsic::memcpy: {
955       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
956       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getDestAlign(),
957                            MemCpy->getRawSource(), MemCpy->getSourceAlign(),
958                            MemCpy->getLength(), MemCpy->isVolatile());
959       Intr->eraseFromParent();
960       continue;
961     }
962     case Intrinsic::memmove: {
963       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
964       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getDestAlign(),
965                             MemMove->getRawSource(), MemMove->getSourceAlign(),
966                             MemMove->getLength(), MemMove->isVolatile());
967       Intr->eraseFromParent();
968       continue;
969     }
970     case Intrinsic::memset: {
971       MemSetInst *MemSet = cast<MemSetInst>(Intr);
972       Builder.CreateMemSet(
973           MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
974           MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
975       Intr->eraseFromParent();
976       continue;
977     }
978     case Intrinsic::invariant_start:
979     case Intrinsic::invariant_end:
980     case Intrinsic::launder_invariant_group:
981     case Intrinsic::strip_invariant_group:
982       Intr->eraseFromParent();
983       // FIXME: I think the invariant marker should still theoretically apply,
984       // but the intrinsics need to be changed to accept pointers with any
985       // address space.
986       continue;
987     case Intrinsic::objectsize: {
988       Value *Src = Intr->getOperand(0);
989       Type *SrcTy = Src->getType()->getPointerElementType();
990       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
991         Intrinsic::objectsize,
992         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
993       );
994 
995       CallInst *NewCall = Builder.CreateCall(
996           ObjectSize,
997           {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
998       Intr->replaceAllUsesWith(NewCall);
999       Intr->eraseFromParent();
1000       continue;
1001     }
1002     default:
1003       Intr->print(errs());
1004       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1005     }
1006   }
1007   return true;
1008 }
1009 
1010 bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
1011   if (skipFunction(F) || DisablePromoteAllocaToVector)
1012     return false;
1013 
1014   bool Changed = false;
1015   BasicBlock &EntryBB = *F.begin();
1016 
1017   SmallVector<AllocaInst *, 16> Allocas;
1018   for (Instruction &I : EntryBB) {
1019     if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
1020       Allocas.push_back(AI);
1021   }
1022 
1023   for (AllocaInst *AI : Allocas) {
1024     if (handleAlloca(*AI))
1025       Changed = true;
1026   }
1027 
1028   return Changed;
1029 }
1030 
1031 bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
1032   // Array allocations are probably not worth handling, since an allocation of
1033   // the array type is the canonical form.
1034   if (!I.isStaticAlloca() || I.isArrayAllocation())
1035     return false;
1036 
1037   LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
1038 
1039   Module *Mod = I.getParent()->getParent()->getParent();
1040   return tryPromoteAllocaToVector(&I, Mod->getDataLayout());
1041 }
1042 
1043 FunctionPass *llvm::createAMDGPUPromoteAlloca() {
1044   return new AMDGPUPromoteAlloca();
1045 }
1046 
1047 FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
1048   return new AMDGPUPromoteAllocaToVector();
1049 }
1050