//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
  const TargetMachine *TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;
  MDNode *MaxWorkGroupSizeRange = nullptr;
  AMDGPUAS AS;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca,
                               Value *Val,
                               std::vector<Value*> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val should
  /// be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

public:
  static char ID;

  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
    FunctionPass(ID), TM(TM_) {}

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void handleAlloca(AllocaInst &I);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  if (!TM)
    return false;

  Mod = &M;
  DL = &Mod->getDataLayout();

  // The maximum workitem id.
  //
  // FIXME: Should get as subtarget property. Usually runtime enforced max is
  // 256.
  MDBuilder MDB(Mod->getContext());
  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));

  const Triple &TT = TM->getTargetTriple();

  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  if (!ST.isPromoteAllocaEnabled())
    return false;
  AS = AMDGPU::getAMDGPUAS(*F.getParent());

  FunctionType *FTy = F.getFunctionType();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                      "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  const DataLayout &DL = Mod->getDataLayout();

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
      continue;

    for (const User *U : GV.users()) {
      const Instruction *Use = dyn_cast<Instruction>(U);
      if (!Use)
        continue;

      if (Use->getParent()->getParent() == &F) {
        unsigned Align = GV.getAlignment();
        if (Align == 0)
          Align = DL.getABITypeAlignment(GV.getValueType());

        // FIXME: Try to account for padding here. The padding is currently
        // determined from the inverse order of uses in the function. I'm not
        // sure if the use list order is in any way connected to this, so the
        // total reported size is likely incorrect.
        uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
        CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
        CurrentLocalMemUsage += AllocSize;
        break;
      }
    }
  }

  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
                                                          F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  DEBUG(
    dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n"
    << "  Rounding size to " << MaxSizeWithWaveCount
    << " with a maximum occupancy of " << MaxOccupancy << '\n'
    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
    << " available for promotion\n"
  );

  BasicBlock &EntryBB = *F.begin();
  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
    AllocaInst *AI = dyn_cast<AllocaInst>(I);

    ++I;
    if (AI)
      handleAlloca(*AI);
  }

  return true;
}

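// Return the workgroup size in the Y and Z dimensions. On non-HSA targets this
// uses the r600 local size intrinsics; on amdhsa the sizes are loaded from the
// kernel dispatch packet.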
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);

  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

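// Return the workitem id for dimension N (0 = x, 1 = y, 2 = z), with range
// metadata attached so later passes know its maximum value.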
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
      : Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
      : Intrinsic::r600_read_tidig_y;
    break;
  case 2:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
      : Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  return CI;
}

static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

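// Look up the vector index previously recorded for a GEP into the alloca.
// Returns null if the GEP is not in the map.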
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

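// For a GEP of the form gep [N x T], ptr, 0, idx return the index operand,
// otherwise return null.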
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}

// Return true if this instruction can be handled by the vector promotion
// below: loads, casts, and stores where the pointer is the address operand
// rather than the stored value.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value.
    StoreInst *SI = cast<StoreInst>(Inst);
    return SI->getPointerOperand() == User;
  }
  default:
    return false;
  }
}

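// Try to replace a small, statically sized array alloca with a single vector
// value, turning loads and stores through GEPs into extractelement and
// insertelement operations.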
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy ||
      AllocaTy->getElementType()->isVectorTy() ||
      AllocaTy->getNumElements() > 4 ||
      AllocaTy->getNumElements() < 2) {
    DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << "  Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

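  // Rewrite each collected use: a load becomes a load of the whole vector plus
  // an extractelement, and a store becomes a load/insertelement/store of the
  // whole vector.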
  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);

      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);

      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

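// Intrinsic calls we know how to rewrite (or simply drop) when the pointer
// they operate on is moved to the local address space.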
static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::invariant_group_barrier:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
                                                          Value *Val,
                                                          Instruction *Inst,
                                                          int OpIdx0,
                                                          int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}

bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
  Value *BaseAlloca,
  Value *Val,
  std::vector<Value*> &WorkList) const {

  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the pointer is being stored as a value; it must be the
      // address operand of the store.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote an icmp if we know that its other operand is derived from
    // the same alloca (or is a null pointer constant).
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I, AS)) {
    DEBUG(dbgs() << "  alloca was promoted to a vector\n");
    return;
  }

  const Function &ContainingFunction = *I.getParent()->getParent();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
    return;

  const AMDGPUSubtarget &ST =
    TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
  // FIXME: We should also try to get this value from the reqd_work_group_size
  // function attribute if it is available.
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  const DataLayout &DL = Mod->getDataLayout();

  unsigned Align = I.getAlignment();
  if (Align == 0)
    Align = DL.getABITypeAlignment(I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // we could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    DEBUG(dbgs() << "  " << AllocSize
          << " bytes of local memory not available to promote\n");
    return;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

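  // Create a single LDS array with one element of the alloca's type per
  // workitem in the (maximum sized) workgroup.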
  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AS.LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(I.getAlignment());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

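  // Linearize the workitem id: TID = tid.x * cnt.y * cnt.z + tid.y * cnt.z +
  // tid.z, which selects this workitem's private slot in the LDS array.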
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

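  // Fix up the remaining collected uses: rewrite pointer types to the local
  // address space and recreate intrinsic calls whose signatures encode an
  // address space.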
  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *Src0 = CI->getOperand(0);
        Type *EltTy = Src0->getType()->getPointerElementType();
        PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want to
      // touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memmove: {
      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
                            MemMove->getLength(), MemMove->getAlignment(),
                            MemMove->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::invariant_group_barrier:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
        Intrinsic::objectsize,
        { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
      );

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
  return new AMDGPUPromoteAlloca(TM);
}