1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/MDBuilder.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/raw_ostream.h"
23 
24 #define DEBUG_TYPE "amdgpu-promote-alloca"
25 
26 using namespace llvm;
27 
28 namespace {
29 
30 // FIXME: This can create globals so should be a module pass.
31 class AMDGPUPromoteAlloca : public FunctionPass {
32 private:
33   const TargetMachine *TM;
34   Module *Mod;
35   const DataLayout *DL;
36   MDNode *MaxWorkGroupSizeRange;
37 
38   // FIXME: This should be per-kernel.
39   int LocalMemAvailable;
40 
41   bool IsAMDGCN;
42   bool IsAMDHSA;
43 
44   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
45   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
46 
47   /// BaseAlloca is the alloca root the search started from.
48   /// Val may be that alloca or a recursive user of it.
49   bool collectUsesWithPtrTypes(Value *BaseAlloca,
50                                Value *Val,
51                                std::vector<Value*> &WorkList) const;
52 
53   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
54   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
55   /// Returns true if both operands are derived from the same alloca. Val should
56   /// be the same value as one of the input operands of UseInst.
57   bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
58                                        Instruction *UseInst,
59                                        int OpIdx0, int OpIdx1) const;
60 
61 public:
62   static char ID;
63 
64   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
65     FunctionPass(ID),
66     TM(TM_),
67     Mod(nullptr),
68     DL(nullptr),
69     MaxWorkGroupSizeRange(nullptr),
70     LocalMemAvailable(0),
71     IsAMDGCN(false),
72     IsAMDHSA(false) { }
73 
74   bool doInitialization(Module &M) override;
75   bool runOnFunction(Function &F) override;
76 
77   const char *getPassName() const override {
78     return "AMDGPU Promote Alloca";
79   }
80 
81   void handleAlloca(AllocaInst &I);
82 
83   void getAnalysisUsage(AnalysisUsage &AU) const override {
84     AU.setPreservesCFG();
85     FunctionPass::getAnalysisUsage(AU);
86   }
87 };
88 
89 } // End anonymous namespace
90 
91 char AMDGPUPromoteAlloca::ID = 0;
92 
93 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
94                    "AMDGPU promote alloca to vector or LDS", false, false)
95 
96 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
97 
98 
99 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
100   if (!TM)
101     return false;
102 
103   Mod = &M;
104   DL = &Mod->getDataLayout();
105 
106   // The maximum workitem id.
107   //
108   // FIXME: Should get as subtarget property. Usually runtime enforced max is
109   // 256.
110   MDBuilder MDB(Mod->getContext());
111   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
112 
113   const Triple &TT = TM->getTargetTriple();
114 
115   IsAMDGCN = TT.getArch() == Triple::amdgcn;
116   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
117 
118   return false;
119 }
120 
121 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
122   if (!TM || skipFunction(F))
123     return false;
124 
125   FunctionType *FTy = F.getFunctionType();
126 
127   // If the function has any arguments in the local address space, then it's
128   // possible these arguments require the entire local memory space, so
129   // we cannot use local memory in the pass.
130   for (Type *ParamTy : FTy->params()) {
131     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
132     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
133       LocalMemAvailable = 0;
134       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
135                       "local memory disabled.\n");
136       return false;
137     }
138   }
139 
140   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
141   LocalMemAvailable = ST.getLocalMemorySize();
142   if (LocalMemAvailable == 0)
143     return false;
144 
145   // Check how much local memory is being used by global objects
146   for (GlobalVariable &GV : Mod->globals()) {
147     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
148       continue;
149 
150     for (User *U : GV.users()) {
151       Instruction *Use = dyn_cast<Instruction>(U);
152       if (!Use)
153         continue;
154 
155       if (Use->getParent()->getParent() == &F) {
156         LocalMemAvailable -= DL->getTypeAllocSize(GV.getValueType());
157         break;
158       }
159     }
160   }
161 
162   LocalMemAvailable = std::max(0, LocalMemAvailable);
163   DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
164 
165   BasicBlock &EntryBB = *F.begin();
166   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
167     AllocaInst *AI = dyn_cast<AllocaInst>(I);
168 
169     ++I;
170     if (AI)
171       handleAlloca(*AI);
172   }
173 
174   return true;
175 }
176 
177 std::pair<Value *, Value *>
178 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
179   if (!IsAMDHSA) {
180     Function *LocalSizeYFn
181       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
182     Function *LocalSizeZFn
183       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
184 
185     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
186     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
187 
188     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
189     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
190 
191     return std::make_pair(LocalSizeY, LocalSizeZ);
192   }
193 
194   // We must read the size out of the dispatch pointer.
195   assert(IsAMDGCN);
196 
197   // We are indexing into this struct, and want to extract the workgroup_size_*
198   // fields.
199   //
200   //   typedef struct hsa_kernel_dispatch_packet_s {
201   //     uint16_t header;
202   //     uint16_t setup;
203   //     uint16_t workgroup_size_x ;
204   //     uint16_t workgroup_size_y;
205   //     uint16_t workgroup_size_z;
206   //     uint16_t reserved0;
207   //     uint32_t grid_size_x ;
208   //     uint32_t grid_size_y ;
209   //     uint32_t grid_size_z;
210   //
211   //     uint32_t private_segment_size;
212   //     uint32_t group_segment_size;
213   //     uint64_t kernel_object;
214   //
215   // #ifdef HSA_LARGE_MODEL
216   //     void *kernarg_address;
217   // #elif defined HSA_LITTLE_ENDIAN
218   //     void *kernarg_address;
219   //     uint32_t reserved1;
220   // #else
221   //     uint32_t reserved1;
222   //     void *kernarg_address;
223   // #endif
224   //     uint64_t reserved2;
225   //     hsa_signal_t completion_signal; // uint64_t wrapper
226   //   } hsa_kernel_dispatch_packet_t
227   //
228   Function *DispatchPtrFn
229     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
230 
231   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
232   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
233   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
234 
235   // Size of the dispatch packet struct.
236   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
237 
238   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
239   Value *CastDispatchPtr = Builder.CreateBitCast(
240     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
241 
242   // We could do a single 64-bit load here, but it's likely that the basic
243   // 32-bit and extract sequence is already present, and it is probably easier
244   // to CSE this. The loads should be mergable later anyway.
245   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
246   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
247 
248   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
249   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
250 
251   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
252   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
253   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
254   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
255 
256   // Extract y component. Upper half of LoadZU should be zero already.
257   Value *Y = Builder.CreateLShr(LoadXY, 16);
258 
259   return std::make_pair(Y, LoadZU);
260 }
261 
262 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
263   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
264 
265   switch (N) {
266   case 0:
267     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
268       : Intrinsic::r600_read_tidig_x;
269     break;
270   case 1:
271     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
272       : Intrinsic::r600_read_tidig_y;
273     break;
274 
275   case 2:
276     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
277       : Intrinsic::r600_read_tidig_z;
278     break;
279   default:
280     llvm_unreachable("invalid dimension");
281   }
282 
283   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
284   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
285   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
286 
287   return CI;
288 }
289 
290 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
291   return VectorType::get(ArrayTy->getArrayElementType(),
292                          ArrayTy->getArrayNumElements());
293 }
294 
295 static Value *
296 calculateVectorIndex(Value *Ptr,
297                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
298   if (isa<AllocaInst>(Ptr))
299     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
300 
301   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
302 
303   auto I = GEPIdx.find(GEP);
304   return I == GEPIdx.end() ? nullptr : I->second;
305 }
306 
307 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
308   // FIXME we only support simple cases
309   if (GEP->getNumOperands() != 3)
310     return NULL;
311 
312   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
313   if (!I0 || !I0->isZero())
314     return NULL;
315 
316   return GEP->getOperand(2);
317 }
318 
319 // Not an instruction handled below to turn into a vector.
320 //
321 // TODO: Check isTriviallyVectorizable for calls and handle other
322 // instructions.
323 static bool canVectorizeInst(Instruction *Inst, User *User) {
324   switch (Inst->getOpcode()) {
325   case Instruction::Load:
326   case Instruction::BitCast:
327   case Instruction::AddrSpaceCast:
328     return true;
329   case Instruction::Store: {
330     // Must be the stored pointer operand, not a stored value.
331     StoreInst *SI = cast<StoreInst>(Inst);
332     return SI->getPointerOperand() == User;
333   }
334   default:
335     return false;
336   }
337 }
338 
339 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
340   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
341 
342   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
343 
344   // FIXME: There is no reason why we can't support larger arrays, we
345   // are just being conservative for now.
346   if (!AllocaTy ||
347       AllocaTy->getElementType()->isVectorTy() ||
348       AllocaTy->getNumElements() > 4) {
349     DEBUG(dbgs() << "  Cannot convert type to vector\n");
350     return false;
351   }
352 
353   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
354   std::vector<Value*> WorkList;
355   for (User *AllocaUser : Alloca->users()) {
356     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
357     if (!GEP) {
358       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
359         return false;
360 
361       WorkList.push_back(AllocaUser);
362       continue;
363     }
364 
365     Value *Index = GEPToVectorIndex(GEP);
366 
367     // If we can't compute a vector index from this GEP, then we can't
368     // promote this alloca to vector.
369     if (!Index) {
370       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
371       return false;
372     }
373 
374     GEPVectorIdx[GEP] = Index;
375     for (User *GEPUser : AllocaUser->users()) {
376       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
377         return false;
378 
379       WorkList.push_back(GEPUser);
380     }
381   }
382 
383   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
384 
385   DEBUG(dbgs() << "  Converting alloca to vector "
386         << *AllocaTy << " -> " << *VectorTy << '\n');
387 
388   for (Value *V : WorkList) {
389     Instruction *Inst = cast<Instruction>(V);
390     IRBuilder<> Builder(Inst);
391     switch (Inst->getOpcode()) {
392     case Instruction::Load: {
393       Value *Ptr = Inst->getOperand(0);
394       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
395       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
396       Value *VecValue = Builder.CreateLoad(BitCast);
397       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
398       Inst->replaceAllUsesWith(ExtractElement);
399       Inst->eraseFromParent();
400       break;
401     }
402     case Instruction::Store: {
403       Value *Ptr = Inst->getOperand(1);
404       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
405       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
406       Value *VecValue = Builder.CreateLoad(BitCast);
407       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
408                                                        Inst->getOperand(0),
409                                                        Index);
410       Builder.CreateStore(NewVecValue, BitCast);
411       Inst->eraseFromParent();
412       break;
413     }
414     case Instruction::BitCast:
415     case Instruction::AddrSpaceCast:
416       break;
417 
418     default:
419       Inst->dump();
420       llvm_unreachable("Inconsistency in instructions promotable to vector");
421     }
422   }
423   return true;
424 }
425 
426 static bool isCallPromotable(CallInst *CI) {
427   // TODO: We might be able to handle some cases where the callee is a
428   // constantexpr bitcast of a function.
429   if (!CI->getCalledFunction())
430     return false;
431 
432   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
433   if (!II)
434     return false;
435 
436   switch (II->getIntrinsicID()) {
437   case Intrinsic::memcpy:
438   case Intrinsic::memmove:
439   case Intrinsic::memset:
440   case Intrinsic::lifetime_start:
441   case Intrinsic::lifetime_end:
442   case Intrinsic::invariant_start:
443   case Intrinsic::invariant_end:
444   case Intrinsic::invariant_group_barrier:
445   case Intrinsic::objectsize:
446     return true;
447   default:
448     return false;
449   }
450 }
451 
452 bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca,
453                                                           Value *Val,
454                                                           Instruction *Inst,
455                                                           int OpIdx0,
456                                                           int OpIdx1) const {
457   // Figure out which operand is the one we might not be promoting.
458   Value *OtherOp = Inst->getOperand(OpIdx0);
459   if (Val == OtherOp)
460     OtherOp = Inst->getOperand(OpIdx1);
461 
462   Value *OtherObj = GetUnderlyingObject(OtherOp, *DL);
463   if (!isa<AllocaInst>(OtherObj))
464     return false;
465 
466   // TODO: We should be able to replace undefs with the right pointer type.
467 
468   // TODO: If we know the other base object is another promotable
469   // alloca, not necessarily this alloca, we can do this. The
470   // important part is both must have the same address space at
471   // the end.
472   if (OtherObj != BaseAlloca) {
473     DEBUG(dbgs() << "Found a binary instruction with another alloca object\n");
474     return false;
475   }
476 
477   return true;
478 }
479 
480 bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes(
481   Value *BaseAlloca,
482   Value *Val,
483   std::vector<Value*> &WorkList) const {
484 
485   for (User *User : Val->users()) {
486     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
487       continue;
488 
489     if (CallInst *CI = dyn_cast<CallInst>(User)) {
490       if (!isCallPromotable(CI))
491         return false;
492 
493       WorkList.push_back(User);
494       continue;
495     }
496 
497     Instruction *UseInst = cast<Instruction>(User);
498     if (UseInst->getOpcode() == Instruction::PtrToInt)
499       return false;
500 
501     if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
502       if (SI->isVolatile())
503         return false;
504 
505       // Reject if the stored value is not the pointer operand.
506       if (SI->getPointerOperand() != Val)
507         return false;
508     } else if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
509       if (LI->isVolatile())
510         return false;
511     } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
512       if (RMW->isVolatile())
513         return false;
514     } else if (AtomicCmpXchgInst *CAS
515                = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
516       if (CAS->isVolatile())
517         return false;
518     }
519 
520     // Only promote a select if we know that the other select operand
521     // is from another pointer that will also be promoted.
522     if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
523       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
524         return false;
525     }
526 
527     if (!User->getType()->isPointerTy())
528       continue;
529 
530     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
531       // Be conservative if an address could be computed outside the bounds of
532       // the alloca.
533       if (!GEP->isInBounds())
534         return false;
535     }
536 
537     // Only promote a select if we know that the other select operand is from
538     // another pointer that will also be promoted.
539     if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
540       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
541         return false;
542     }
543 
544     // Repeat for phis.
545     if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
546       // TODO: Handle more complex cases. We should be able to replace loops
547       // over arrays.
548       switch (Phi->getNumIncomingValues()) {
549       case 1:
550         break;
551       case 2:
552         if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
553           return false;
554         break;
555       default:
556         return false;
557       }
558     }
559 
560     WorkList.push_back(User);
561     if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
562       return false;
563   }
564 
565   return true;
566 }
567 
568 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
569   // Array allocations are probably not worth handling, since an allocation of
570   // the array type is the canonical form.
571   if (!I.isStaticAlloca() || I.isArrayAllocation())
572     return;
573 
574   IRBuilder<> Builder(&I);
575 
576   // First try to replace the alloca with a vector
577   Type *AllocaTy = I.getAllocatedType();
578 
579   DEBUG(dbgs() << "Trying to promote " << I << '\n');
580 
581   if (tryPromoteAllocaToVector(&I))
582     return;
583 
584   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
585 
586   const Function &ContainingFunction = *I.getParent()->getParent();
587 
588   // FIXME: We should also try to get this value from the reqd_work_group_size
589   // function attribute if it is available.
590   unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
591 
592   int AllocaSize =
593       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
594 
595   if (AllocaSize > LocalMemAvailable) {
596     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
597     return;
598   }
599 
600   std::vector<Value*> WorkList;
601 
602   if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
603     DEBUG(dbgs() << " Do not know how to convert all uses\n");
604     return;
605   }
606 
607   DEBUG(dbgs() << "Promoting alloca to local memory\n");
608   LocalMemAvailable -= AllocaSize;
609 
610   Function *F = I.getParent()->getParent();
611 
612   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
613   GlobalVariable *GV = new GlobalVariable(
614       *Mod, GVTy, false, GlobalValue::InternalLinkage,
615       UndefValue::get(GVTy),
616       Twine(F->getName()) + Twine('.') + I.getName(),
617       nullptr,
618       GlobalVariable::NotThreadLocal,
619       AMDGPUAS::LOCAL_ADDRESS);
620   GV->setUnnamedAddr(true);
621   GV->setAlignment(I.getAlignment());
622 
623   Value *TCntY, *TCntZ;
624 
625   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
626   Value *TIdX = getWorkitemID(Builder, 0);
627   Value *TIdY = getWorkitemID(Builder, 1);
628   Value *TIdZ = getWorkitemID(Builder, 2);
629 
630   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
631   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
632   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
633   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
634   TID = Builder.CreateAdd(TID, TIdZ);
635 
636   Value *Indices[] = {
637     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
638     TID
639   };
640 
641   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
642   I.mutateType(Offset->getType());
643   I.replaceAllUsesWith(Offset);
644   I.eraseFromParent();
645 
646   for (Value *V : WorkList) {
647     CallInst *Call = dyn_cast<CallInst>(V);
648     if (!Call) {
649       Type *EltTy = V->getType()->getPointerElementType();
650       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
651 
652       // The operand's value should be corrected on its own.
653       if (isa<AddrSpaceCastInst>(V))
654         continue;
655 
656       // FIXME: It doesn't really make sense to try to do this for all
657       // instructions.
658       V->mutateType(NewTy);
659       continue;
660     }
661 
662     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
663     if (!Intr) {
664       // FIXME: What is this for? It doesn't make sense to promote arbitrary
665       // function calls. If the call is to a defined function that can also be
666       // promoted, we should be able to do this once that function is also
667       // rewritten.
668 
669       std::vector<Type*> ArgTypes;
670       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
671                                 ArgIdx != ArgEnd; ++ArgIdx) {
672         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
673       }
674       Function *F = Call->getCalledFunction();
675       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
676                                                 F->isVarArg());
677       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
678                                              NewType, F->getAttributes());
679       Function *NewF = cast<Function>(C);
680       Call->setCalledFunction(NewF);
681       continue;
682     }
683 
684     Builder.SetInsertPoint(Intr);
685     switch (Intr->getIntrinsicID()) {
686     case Intrinsic::lifetime_start:
687     case Intrinsic::lifetime_end:
688       // These intrinsics are for address space 0 only
689       Intr->eraseFromParent();
690       continue;
691     case Intrinsic::memcpy: {
692       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
693       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
694                            MemCpy->getLength(), MemCpy->getAlignment(),
695                            MemCpy->isVolatile());
696       Intr->eraseFromParent();
697       continue;
698     }
699     case Intrinsic::memmove: {
700       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
701       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
702                             MemMove->getLength(), MemMove->getAlignment(),
703                             MemMove->isVolatile());
704       Intr->eraseFromParent();
705       continue;
706     }
707     case Intrinsic::memset: {
708       MemSetInst *MemSet = cast<MemSetInst>(Intr);
709       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
710                            MemSet->getLength(), MemSet->getAlignment(),
711                            MemSet->isVolatile());
712       Intr->eraseFromParent();
713       continue;
714     }
715     case Intrinsic::invariant_start:
716     case Intrinsic::invariant_end:
717     case Intrinsic::invariant_group_barrier:
718       Intr->eraseFromParent();
719       // FIXME: I think the invariant marker should still theoretically apply,
720       // but the intrinsics need to be changed to accept pointers with any
721       // address space.
722       continue;
723     case Intrinsic::objectsize: {
724       Value *Src = Intr->getOperand(0);
725       Type *SrcTy = Src->getType()->getPointerElementType();
726       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
727         Intrinsic::objectsize,
728         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
729       );
730 
731       CallInst *NewCall
732         = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
733       Intr->replaceAllUsesWith(NewCall);
734       Intr->eraseFromParent();
735       continue;
736     }
737     default:
738       Intr->dump();
739       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
740     }
741   }
742 }
743 
744 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
745   return new AMDGPUPromoteAlloca(TM);
746 }
747