//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to the local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
  const TargetMachine *TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  MDNode *MaxWorkGroupSizeRange = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca,
                               Value *Val,
                               std::vector<Value*> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

public:
  static char ID;

  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
    FunctionPass(ID), TM(TM_) {}

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void handleAlloca(AllocaInst &I);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  if (!TM)
    return false;

  Mod = &M;
  DL = &Mod->getDataLayout();

  // The maximum workitem id.
  //
  // FIXME: Should get as subtarget property. Usually runtime enforced max is
  // 256.
  MDBuilder MDB(Mod->getContext());
  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));

  const Triple &TT = TM->getTargetTriple();

  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  FunctionType *FTy = F.getFunctionType();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                      "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  const DataLayout &DL = Mod->getDataLayout();

  // Check how much local memory is being used by global objects.
  CurrentLocalMemUsage = 0;
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    for (const User *U : GV.users()) {
      const Instruction *Use = dyn_cast<Instruction>(U);
      if (!Use)
        continue;

      if (Use->getParent()->getParent() == &F) {
        unsigned Align = GV.getAlignment();
        if (Align == 0)
          Align = DL.getABITypeAlignment(GV.getValueType());

        // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
        CurrentLocalMemUsage += AllocSize;
        break;
      }
    }
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);

  // The program is possibly broken if it already uses more local memory than
  // is available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  DEBUG(
    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
    << "  Rounding size to " << MaxSizeWithWaveCount
    << " with a maximum occupancy of " << MaxOccupancy << '\n'
    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
    << " available for promotion\n"
  );
  BasicBlock &EntryBB = *F.begin();
  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
    AllocaInst *AI = dyn_cast<AllocaInst>(I);

    ++I;
    if (AI)
      handleAlloca(*AI);
  }

  return true;
}

std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
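  // Dword 1 of the packet (byte offset 4) holds workgroup_size_x in its low
  // 16 bits and workgroup_size_y in its high 16 bits; dword 2 (byte offset 8)
  // holds workgroup_size_z in its low 16 bits and reserved0 in its high bits.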
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);

  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
      : Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
      : Intrinsic::r600_read_tidig_y;
    break;
  case 2:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
      : Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  return CI;
}

static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases.
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}

// Returns true if Inst is an instruction we know how to rewrite when the
// alloca is promoted to a vector. User is the pointer value through which
// Inst uses the alloca (the alloca itself or a GEP into it).
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value.
    StoreInst *SI = cast<StoreInst>(Inst);
    return SI->getPointerOperand() == User;
  }
  default:
    return false;
  }
}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays; we
  // are just being conservative for now.
  if (!AllocaTy ||
      AllocaTy->getElementType()->isVectorTy() ||
      AllocaTy->getNumElements() > 4 ||
      AllocaTy->getNumElements() < 2) {
    DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to a vector.
    if (!Index) {
      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << "  Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

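  // Rewrite each collected use: a load of element i becomes a load of the
  // whole vector followed by an extractelement, and a store to element i
  // becomes a vector load, an insertelement, and a store of the whole vector.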
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);

      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);

      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::invariant_group_barrier:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
                                                          Value *Val,
                                                          Instruction *Inst,
                                                          int OpIdx0,
                                                          int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}

bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
  Value *BaseAlloca,
  Value *Val,
  std::vector<Value*> &WorkList) const {

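  // Walk the users of Val transitively, collecting every value derived from
  // the alloca that will need its type rewritten into WorkList. Bail out on
  // any use we cannot rewrite: captures, volatile accesses, ptrtoint, or
  // comparisons/selects/phis against pointers from a different object.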
  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the pointer is the stored value rather than the store's
      // pointer operand; it would escape through memory.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote an icmp if we know that the other operand is null or
    // derived from the same alloca being promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this cast.
      WorkList.push_back(User);
      continue;
    }

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is null
    // or derived from the same alloca being promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector.
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I)) {
    DEBUG(dbgs() << "  alloca was promoted to a vector.\n");
    return;
  }

  const Function &ContainingFunction = *I.getParent()->getParent();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore, not all LDS is available for some of the stages.
  if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
    return;

  const AMDGPUSubtarget &ST =
    TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
  // FIXME: We should also try to get this value from the reqd_work_group_size
  // function attribute if it is available.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  const DataLayout &DL = Mod->getDataLayout();

  unsigned Align = I.getAlignment();
  if (Align == 0)
    Align = DL.getABITypeAlignment(I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // we could end up using more than the maximum due to alignment padding.

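  // The LDS footprint scales with the work group size since every workitem
  // gets its own copy of the alloca; e.g. a 64-byte alloca with a 256-workitem
  // group needs 64 * 256 = 16384 bytes of LDS.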
  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    DEBUG(dbgs() << "  " << AllocSize
          << " bytes of local memory not available to promote\n");
    return;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlignment());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

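  // Linearize the 3D workitem id so each workitem indexes a distinct array
  // element: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ.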
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

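  // Rewrite the collected derived pointers to the local address space and fix
  // up any constant null operands; calls (intrinsics) are handled separately
  // below.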
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *Src0 = CI->getOperand(0);
        Type *EltTy = Src0->getType()->getPointerElementType();
        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want
      // to touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
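    // For the mem* intrinsics, recreate the call through the builder so the
    // intrinsic declaration is re-selected for the operands' new (local
    // address space) pointer types.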
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memmove: {
      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
                            MemMove->getLength(), MemMove->getAlignment(),
                            MemMove->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::invariant_group_barrier:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
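      // llvm.objectsize is overloaded on both the result type and the pointer
      // type, so a new declaration matching the LDS pointer type is required.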
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
        Intrinsic::objectsize,
        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
      );

      CallInst *NewCall
        = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
  return new AMDGPUPromoteAlloca(TM);
}