1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/InstVisitor.h"
20 #include "llvm/IR/MDBuilder.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/raw_ostream.h"
23 
24 #define DEBUG_TYPE "amdgpu-promote-alloca"
25 
26 using namespace llvm;
27 
28 namespace {
29 
30 // FIXME: This can create globals so should be a module pass.
31 class AMDGPUPromoteAlloca : public FunctionPass,
32                             public InstVisitor<AMDGPUPromoteAlloca> {
33 private:
34   const TargetMachine *TM;
35   Module *Mod;
36   MDNode *MaxWorkGroupSizeRange;
37 
38   // FIXME: This should be per-kernel.
39   int LocalMemAvailable;
40 
41   bool IsAMDGCN;
42   bool IsAMDHSA;
43 
44   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
45   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
46 
47 public:
48   static char ID;
49 
50   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
51     FunctionPass(ID),
52     TM(TM_),
53     Mod(nullptr),
54     MaxWorkGroupSizeRange(nullptr),
55     LocalMemAvailable(0),
56     IsAMDGCN(false),
57     IsAMDHSA(false) { }
58 
59   bool doInitialization(Module &M) override;
60   bool runOnFunction(Function &F) override;
61 
62   const char *getPassName() const override {
63     return "AMDGPU Promote Alloca";
64   }
65 
66   void visitAlloca(AllocaInst &I);
67 };
68 
69 } // End anonymous namespace
70 
71 char AMDGPUPromoteAlloca::ID = 0;
72 
73 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
74                    "AMDGPU promote alloca to vector or LDS", false, false)
75 
76 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
77 
78 
79 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
80   if (!TM)
81     return false;
82 
83   Mod = &M;
84 
85   // The maximum workitem id.
86   //
87   // FIXME: Should get as subtarget property. Usually runtime enforced max is
88   // 256.
89   MDBuilder MDB(Mod->getContext());
90   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
91 
92   const Triple &TT = TM->getTargetTriple();
93 
94   IsAMDGCN = TT.getArch() == Triple::amdgcn;
95   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
96 
97   return false;
98 }
99 
100 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
101   if (!TM || F.hasFnAttribute(Attribute::OptimizeNone))
102     return false;
103 
104   FunctionType *FTy = F.getFunctionType();
105 
106   // If the function has any arguments in the local address space, then it's
107   // possible these arguments require the entire local memory space, so
108   // we cannot use local memory in the pass.
109   for (Type *ParamTy : FTy->params()) {
110     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
111     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
112       LocalMemAvailable = 0;
113       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
114                       "local memory disabled.\n");
115       return false;
116     }
117   }
118 
119   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
120   LocalMemAvailable = ST.getLocalMemorySize();
121   if (LocalMemAvailable == 0)
122     return false;
123 
124   // Check how much local memory is being used by global objects
125   for (GlobalVariable &GV : Mod->globals()) {
126     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
127       continue;
128 
129     for (Use &U : GV.uses()) {
130       Instruction *Use = dyn_cast<Instruction>(U);
131       if (!Use)
132         continue;
133 
134       if (Use->getParent()->getParent() == &F)
135         LocalMemAvailable -=
136           Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
137     }
138   }
139 
140   LocalMemAvailable = std::max(0, LocalMemAvailable);
141   DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
142 
143   visit(F);
144 
145   return true;
146 }
147 
148 std::pair<Value *, Value *>
149 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
150   if (!IsAMDHSA) {
151     Function *LocalSizeYFn
152       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
153     Function *LocalSizeZFn
154       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
155 
156     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
157     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
158 
159     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
160     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
161 
162     return std::make_pair(LocalSizeY, LocalSizeZ);
163   }
164 
165   // We must read the size out of the dispatch pointer.
166   assert(IsAMDGCN);
167 
168   // We are indexing into this struct, and want to extract the workgroup_size_*
169   // fields.
170   //
171   //   typedef struct hsa_kernel_dispatch_packet_s {
172   //     uint16_t header;
173   //     uint16_t setup;
174   //     uint16_t workgroup_size_x ;
175   //     uint16_t workgroup_size_y;
176   //     uint16_t workgroup_size_z;
177   //     uint16_t reserved0;
178   //     uint32_t grid_size_x ;
179   //     uint32_t grid_size_y ;
180   //     uint32_t grid_size_z;
181   //
182   //     uint32_t private_segment_size;
183   //     uint32_t group_segment_size;
184   //     uint64_t kernel_object;
185   //
186   // #ifdef HSA_LARGE_MODEL
187   //     void *kernarg_address;
188   // #elif defined HSA_LITTLE_ENDIAN
189   //     void *kernarg_address;
190   //     uint32_t reserved1;
191   // #else
192   //     uint32_t reserved1;
193   //     void *kernarg_address;
194   // #endif
195   //     uint64_t reserved2;
196   //     hsa_signal_t completion_signal; // uint64_t wrapper
197   //   } hsa_kernel_dispatch_packet_t
198   //
199   Function *DispatchPtrFn
200     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
201 
202   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
203   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
204   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
205 
206   // Size of the dispatch packet struct.
207   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
208 
209   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
210   Value *CastDispatchPtr = Builder.CreateBitCast(
211     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
212 
213   // We could do a single 64-bit load here, but it's likely that the basic
214   // 32-bit and extract sequence is already present, and it is probably easier
215   // to CSE this. The loads should be mergable later anyway.
216   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
217   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
218 
219   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
220   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
221 
222   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
223   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
224   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
225   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
226 
227   // Extract y component. Upper half of LoadZU should be zero already.
228   Value *Y = Builder.CreateLShr(LoadXY, 16);
229 
230   return std::make_pair(Y, LoadZU);
231 }
232 
233 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
234   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
235 
236   switch (N) {
237   case 0:
238     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
239       : Intrinsic::r600_read_tidig_x;
240     break;
241   case 1:
242     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
243       : Intrinsic::r600_read_tidig_y;
244     break;
245 
246   case 2:
247     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
248       : Intrinsic::r600_read_tidig_z;
249     break;
250   default:
251     llvm_unreachable("invalid dimension");
252   }
253 
254   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
255   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
256   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
257 
258   return CI;
259 }
260 
261 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
262   return VectorType::get(ArrayTy->getArrayElementType(),
263                          ArrayTy->getArrayNumElements());
264 }
265 
266 static Value *
267 calculateVectorIndex(Value *Ptr,
268                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
269   if (isa<AllocaInst>(Ptr))
270     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
271 
272   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
273 
274   auto I = GEPIdx.find(GEP);
275   return I == GEPIdx.end() ? nullptr : I->second;
276 }
277 
278 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
279   // FIXME we only support simple cases
280   if (GEP->getNumOperands() != 3)
281     return NULL;
282 
283   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
284   if (!I0 || !I0->isZero())
285     return NULL;
286 
287   return GEP->getOperand(2);
288 }
289 
290 // Not an instruction handled below to turn into a vector.
291 //
292 // TODO: Check isTriviallyVectorizable for calls and handle other
293 // instructions.
294 static bool canVectorizeInst(Instruction *Inst, User *User) {
295   switch (Inst->getOpcode()) {
296   case Instruction::Load:
297   case Instruction::BitCast:
298   case Instruction::AddrSpaceCast:
299     return true;
300   case Instruction::Store: {
301     // Must be the stored pointer operand, not a stored value.
302     StoreInst *SI = cast<StoreInst>(Inst);
303     return SI->getPointerOperand() == User;
304   }
305   default:
306     return false;
307   }
308 }
309 
310 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
311   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
312 
313   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
314 
315   // FIXME: There is no reason why we can't support larger arrays, we
316   // are just being conservative for now.
317   if (!AllocaTy ||
318       AllocaTy->getElementType()->isVectorTy() ||
319       AllocaTy->getNumElements() > 4) {
320     DEBUG(dbgs() << "  Cannot convert type to vector\n");
321     return false;
322   }
323 
324   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
325   std::vector<Value*> WorkList;
326   for (User *AllocaUser : Alloca->users()) {
327     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
328     if (!GEP) {
329       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
330         return false;
331 
332       WorkList.push_back(AllocaUser);
333       continue;
334     }
335 
336     Value *Index = GEPToVectorIndex(GEP);
337 
338     // If we can't compute a vector index from this GEP, then we can't
339     // promote this alloca to vector.
340     if (!Index) {
341       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
342       return false;
343     }
344 
345     GEPVectorIdx[GEP] = Index;
346     for (User *GEPUser : AllocaUser->users()) {
347       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
348         return false;
349 
350       WorkList.push_back(GEPUser);
351     }
352   }
353 
354   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
355 
356   DEBUG(dbgs() << "  Converting alloca to vector "
357         << *AllocaTy << " -> " << *VectorTy << '\n');
358 
359   for (Value *V : WorkList) {
360     Instruction *Inst = cast<Instruction>(V);
361     IRBuilder<> Builder(Inst);
362     switch (Inst->getOpcode()) {
363     case Instruction::Load: {
364       Value *Ptr = Inst->getOperand(0);
365       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
366       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
367       Value *VecValue = Builder.CreateLoad(BitCast);
368       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
369       Inst->replaceAllUsesWith(ExtractElement);
370       Inst->eraseFromParent();
371       break;
372     }
373     case Instruction::Store: {
374       Value *Ptr = Inst->getOperand(1);
375       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
376       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
377       Value *VecValue = Builder.CreateLoad(BitCast);
378       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
379                                                        Inst->getOperand(0),
380                                                        Index);
381       Builder.CreateStore(NewVecValue, BitCast);
382       Inst->eraseFromParent();
383       break;
384     }
385     case Instruction::BitCast:
386     case Instruction::AddrSpaceCast:
387       break;
388 
389     default:
390       Inst->dump();
391       llvm_unreachable("Inconsistency in instructions promotable to vector");
392     }
393   }
394   return true;
395 }
396 
397 static bool isCallPromotable(CallInst *CI) {
398   // TODO: We might be able to handle some cases where the callee is a
399   // constantexpr bitcast of a function.
400   if (!CI->getCalledFunction())
401     return false;
402 
403   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
404   if (!II)
405     return false;
406 
407   switch (II->getIntrinsicID()) {
408   case Intrinsic::memcpy:
409   case Intrinsic::memmove:
410   case Intrinsic::memset:
411   case Intrinsic::lifetime_start:
412   case Intrinsic::lifetime_end:
413   case Intrinsic::invariant_start:
414   case Intrinsic::invariant_end:
415   case Intrinsic::invariant_group_barrier:
416   case Intrinsic::objectsize:
417     return true;
418   default:
419     return false;
420   }
421 }
422 
423 static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
424   for (User *User : Val->users()) {
425     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
426       continue;
427 
428     if (CallInst *CI = dyn_cast<CallInst>(User)) {
429       if (!isCallPromotable(CI))
430         return false;
431 
432       WorkList.push_back(User);
433       continue;
434     }
435 
436     // FIXME: Correctly handle ptrtoint instructions.
437     Instruction *UseInst = dyn_cast<Instruction>(User);
438     if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
439       return false;
440 
441     if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
442       // Reject if the stored value is not the pointer operand.
443       if (SI->getPointerOperand() != Val)
444         return false;
445     }
446 
447     if (!User->getType()->isPointerTy())
448       continue;
449 
450     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
451       // Be conservative if an address could be computed outside the bounds of
452       // the alloca.
453       if (!GEP->isInBounds())
454         return false;
455     }
456 
457     WorkList.push_back(User);
458     if (!collectUsesWithPtrTypes(User, WorkList))
459       return false;
460   }
461 
462   return true;
463 }
464 
465 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
466   if (!I.isStaticAlloca())
467     return;
468 
469   IRBuilder<> Builder(&I);
470 
471   // First try to replace the alloca with a vector
472   Type *AllocaTy = I.getAllocatedType();
473 
474   DEBUG(dbgs() << "Trying to promote " << I << '\n');
475 
476   if (tryPromoteAllocaToVector(&I))
477     return;
478 
479   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
480 
481   // FIXME: This is the maximum work group size.  We should try to get
482   // value from the reqd_work_group_size function attribute if it is
483   // available.
484   unsigned WorkGroupSize = 256;
485   int AllocaSize =
486       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
487 
488   if (AllocaSize > LocalMemAvailable) {
489     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
490     return;
491   }
492 
493   std::vector<Value*> WorkList;
494 
495   if (!collectUsesWithPtrTypes(&I, WorkList)) {
496     DEBUG(dbgs() << " Do not know how to convert all uses\n");
497     return;
498   }
499 
500   DEBUG(dbgs() << "Promoting alloca to local memory\n");
501   LocalMemAvailable -= AllocaSize;
502 
503   Function *F = I.getParent()->getParent();
504 
505   Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
506   GlobalVariable *GV = new GlobalVariable(
507       *Mod, GVTy, false, GlobalValue::InternalLinkage,
508       UndefValue::get(GVTy),
509       Twine(F->getName()) + Twine('.') + I.getName(),
510       nullptr,
511       GlobalVariable::NotThreadLocal,
512       AMDGPUAS::LOCAL_ADDRESS);
513   GV->setUnnamedAddr(true);
514   GV->setAlignment(I.getAlignment());
515 
516   Value *TCntY, *TCntZ;
517 
518   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
519   Value *TIdX = getWorkitemID(Builder, 0);
520   Value *TIdY = getWorkitemID(Builder, 1);
521   Value *TIdZ = getWorkitemID(Builder, 2);
522 
523   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
524   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
525   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
526   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
527   TID = Builder.CreateAdd(TID, TIdZ);
528 
529   Value *Indices[] = {
530     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
531     TID
532   };
533 
534   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
535   I.mutateType(Offset->getType());
536   I.replaceAllUsesWith(Offset);
537   I.eraseFromParent();
538 
539   for (Value *V : WorkList) {
540     CallInst *Call = dyn_cast<CallInst>(V);
541     if (!Call) {
542       Type *EltTy = V->getType()->getPointerElementType();
543       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
544 
545       // The operand's value should be corrected on its own.
546       if (isa<AddrSpaceCastInst>(V))
547         continue;
548 
549       // FIXME: It doesn't really make sense to try to do this for all
550       // instructions.
551       V->mutateType(NewTy);
552       continue;
553     }
554 
555     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
556     if (!Intr) {
557       // FIXME: What is this for? It doesn't make sense to promote arbitrary
558       // function calls. If the call is to a defined function that can also be
559       // promoted, we should be able to do this once that function is also
560       // rewritten.
561 
562       std::vector<Type*> ArgTypes;
563       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
564                                 ArgIdx != ArgEnd; ++ArgIdx) {
565         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
566       }
567       Function *F = Call->getCalledFunction();
568       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
569                                                 F->isVarArg());
570       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
571                                              NewType, F->getAttributes());
572       Function *NewF = cast<Function>(C);
573       Call->setCalledFunction(NewF);
574       continue;
575     }
576 
577     Builder.SetInsertPoint(Intr);
578     switch (Intr->getIntrinsicID()) {
579     case Intrinsic::lifetime_start:
580     case Intrinsic::lifetime_end:
581       // These intrinsics are for address space 0 only
582       Intr->eraseFromParent();
583       continue;
584     case Intrinsic::memcpy: {
585       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
586       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
587                            MemCpy->getLength(), MemCpy->getAlignment(),
588                            MemCpy->isVolatile());
589       Intr->eraseFromParent();
590       continue;
591     }
592     case Intrinsic::memmove: {
593       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
594       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
595                             MemMove->getLength(), MemMove->getAlignment(),
596                             MemMove->isVolatile());
597       Intr->eraseFromParent();
598       continue;
599     }
600     case Intrinsic::memset: {
601       MemSetInst *MemSet = cast<MemSetInst>(Intr);
602       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
603                            MemSet->getLength(), MemSet->getAlignment(),
604                            MemSet->isVolatile());
605       Intr->eraseFromParent();
606       continue;
607     }
608     case Intrinsic::invariant_start:
609     case Intrinsic::invariant_end:
610     case Intrinsic::invariant_group_barrier:
611       Intr->eraseFromParent();
612       // FIXME: I think the invariant marker should still theoretically apply,
613       // but the intrinsics need to be changed to accept pointers with any
614       // address space.
615       continue;
616     case Intrinsic::objectsize: {
617       Value *Src = Intr->getOperand(0);
618       Type *SrcTy = Src->getType()->getPointerElementType();
619       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
620         Intrinsic::objectsize,
621         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
622       );
623 
624       CallInst *NewCall
625         = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
626       Intr->replaceAllUsesWith(NewCall);
627       Intr->eraseFromParent();
628       continue;
629     }
630     default:
631       Intr->dump();
632       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
633     }
634   }
635 }
636 
637 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
638   return new AMDGPUPromoteAlloca(TM);
639 }
640