1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/IR/MDBuilder.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/raw_ostream.h"
23 
24 #define DEBUG_TYPE "amdgpu-promote-alloca"
25 
26 using namespace llvm;
27 
28 namespace {
29 
30 // FIXME: This can create globals so should be a module pass.
31 class AMDGPUPromoteAlloca : public FunctionPass {
32 private:
33   const TargetMachine *TM;
34   Module *Mod;
35   MDNode *MaxWorkGroupSizeRange;
36 
37   // FIXME: This should be per-kernel.
38   int LocalMemAvailable;
39 
40   bool IsAMDGCN;
41   bool IsAMDHSA;
42 
43   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
44   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
45 
46 public:
47   static char ID;
48 
49   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
50     FunctionPass(ID),
51     TM(TM_),
52     Mod(nullptr),
53     MaxWorkGroupSizeRange(nullptr),
54     LocalMemAvailable(0),
55     IsAMDGCN(false),
56     IsAMDHSA(false) { }
57 
58   bool doInitialization(Module &M) override;
59   bool runOnFunction(Function &F) override;
60 
61   const char *getPassName() const override {
62     return "AMDGPU Promote Alloca";
63   }
64 
65   void handleAlloca(AllocaInst &I);
66 };
67 
68 } // End anonymous namespace
69 
70 char AMDGPUPromoteAlloca::ID = 0;
71 
72 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
73                    "AMDGPU promote alloca to vector or LDS", false, false)
74 
75 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
76 
77 
78 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
79   if (!TM)
80     return false;
81 
82   Mod = &M;
83 
84   // The maximum workitem id.
85   //
86   // FIXME: Should get as subtarget property. Usually runtime enforced max is
87   // 256.
88   MDBuilder MDB(Mod->getContext());
89   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
90 
91   const Triple &TT = TM->getTargetTriple();
92 
93   IsAMDGCN = TT.getArch() == Triple::amdgcn;
94   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
95 
96   return false;
97 }
98 
99 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
100   if (!TM || F.hasFnAttribute(Attribute::OptimizeNone))
101     return false;
102 
103   FunctionType *FTy = F.getFunctionType();
104 
105   // If the function has any arguments in the local address space, then it's
106   // possible these arguments require the entire local memory space, so
107   // we cannot use local memory in the pass.
108   for (Type *ParamTy : FTy->params()) {
109     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
110     if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
111       LocalMemAvailable = 0;
112       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
113                       "local memory disabled.\n");
114       return false;
115     }
116   }
117 
118   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
119   LocalMemAvailable = ST.getLocalMemorySize();
120   if (LocalMemAvailable == 0)
121     return false;
122 
123   // Check how much local memory is being used by global objects
124   for (GlobalVariable &GV : Mod->globals()) {
125     if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
126       continue;
127 
128     for (Use &U : GV.uses()) {
129       Instruction *Use = dyn_cast<Instruction>(U);
130       if (!Use)
131         continue;
132 
133       if (Use->getParent()->getParent() == &F)
134         LocalMemAvailable -=
135           Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
136     }
137   }
138 
139   LocalMemAvailable = std::max(0, LocalMemAvailable);
140   DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
141 
142   BasicBlock &EntryBB = *F.begin();
143   for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
144     AllocaInst *AI = dyn_cast<AllocaInst>(I);
145 
146     ++I;
147     if (AI)
148       handleAlloca(*AI);
149   }
150 
151   return true;
152 }
153 
154 std::pair<Value *, Value *>
155 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
156   if (!IsAMDHSA) {
157     Function *LocalSizeYFn
158       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
159     Function *LocalSizeZFn
160       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
161 
162     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
163     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
164 
165     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
166     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
167 
168     return std::make_pair(LocalSizeY, LocalSizeZ);
169   }
170 
171   // We must read the size out of the dispatch pointer.
172   assert(IsAMDGCN);
173 
174   // We are indexing into this struct, and want to extract the workgroup_size_*
175   // fields.
176   //
177   //   typedef struct hsa_kernel_dispatch_packet_s {
178   //     uint16_t header;
179   //     uint16_t setup;
180   //     uint16_t workgroup_size_x ;
181   //     uint16_t workgroup_size_y;
182   //     uint16_t workgroup_size_z;
183   //     uint16_t reserved0;
184   //     uint32_t grid_size_x ;
185   //     uint32_t grid_size_y ;
186   //     uint32_t grid_size_z;
187   //
188   //     uint32_t private_segment_size;
189   //     uint32_t group_segment_size;
190   //     uint64_t kernel_object;
191   //
192   // #ifdef HSA_LARGE_MODEL
193   //     void *kernarg_address;
194   // #elif defined HSA_LITTLE_ENDIAN
195   //     void *kernarg_address;
196   //     uint32_t reserved1;
197   // #else
198   //     uint32_t reserved1;
199   //     void *kernarg_address;
200   // #endif
201   //     uint64_t reserved2;
202   //     hsa_signal_t completion_signal; // uint64_t wrapper
203   //   } hsa_kernel_dispatch_packet_t
204   //
205   Function *DispatchPtrFn
206     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
207 
208   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
209   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
210   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
211 
212   // Size of the dispatch packet struct.
213   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
214 
215   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
216   Value *CastDispatchPtr = Builder.CreateBitCast(
217     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
218 
219   // We could do a single 64-bit load here, but it's likely that the basic
220   // 32-bit and extract sequence is already present, and it is probably easier
221   // to CSE this. The loads should be mergable later anyway.
222   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
223   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
224 
225   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
226   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
227 
228   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
229   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
230   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
231   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
232 
233   // Extract y component. Upper half of LoadZU should be zero already.
234   Value *Y = Builder.CreateLShr(LoadXY, 16);
235 
236   return std::make_pair(Y, LoadZU);
237 }
238 
239 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
240   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
241 
242   switch (N) {
243   case 0:
244     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
245       : Intrinsic::r600_read_tidig_x;
246     break;
247   case 1:
248     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
249       : Intrinsic::r600_read_tidig_y;
250     break;
251 
252   case 2:
253     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
254       : Intrinsic::r600_read_tidig_z;
255     break;
256   default:
257     llvm_unreachable("invalid dimension");
258   }
259 
260   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
261   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
262   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
263 
264   return CI;
265 }
266 
267 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
268   return VectorType::get(ArrayTy->getArrayElementType(),
269                          ArrayTy->getArrayNumElements());
270 }
271 
272 static Value *
273 calculateVectorIndex(Value *Ptr,
274                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
275   if (isa<AllocaInst>(Ptr))
276     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
277 
278   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
279 
280   auto I = GEPIdx.find(GEP);
281   return I == GEPIdx.end() ? nullptr : I->second;
282 }
283 
284 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
285   // FIXME we only support simple cases
286   if (GEP->getNumOperands() != 3)
287     return NULL;
288 
289   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
290   if (!I0 || !I0->isZero())
291     return NULL;
292 
293   return GEP->getOperand(2);
294 }
295 
296 // Not an instruction handled below to turn into a vector.
297 //
298 // TODO: Check isTriviallyVectorizable for calls and handle other
299 // instructions.
300 static bool canVectorizeInst(Instruction *Inst, User *User) {
301   switch (Inst->getOpcode()) {
302   case Instruction::Load:
303   case Instruction::BitCast:
304   case Instruction::AddrSpaceCast:
305     return true;
306   case Instruction::Store: {
307     // Must be the stored pointer operand, not a stored value.
308     StoreInst *SI = cast<StoreInst>(Inst);
309     return SI->getPointerOperand() == User;
310   }
311   default:
312     return false;
313   }
314 }
315 
316 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
317   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
318 
319   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
320 
321   // FIXME: There is no reason why we can't support larger arrays, we
322   // are just being conservative for now.
323   if (!AllocaTy ||
324       AllocaTy->getElementType()->isVectorTy() ||
325       AllocaTy->getNumElements() > 4) {
326     DEBUG(dbgs() << "  Cannot convert type to vector\n");
327     return false;
328   }
329 
330   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
331   std::vector<Value*> WorkList;
332   for (User *AllocaUser : Alloca->users()) {
333     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
334     if (!GEP) {
335       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
336         return false;
337 
338       WorkList.push_back(AllocaUser);
339       continue;
340     }
341 
342     Value *Index = GEPToVectorIndex(GEP);
343 
344     // If we can't compute a vector index from this GEP, then we can't
345     // promote this alloca to vector.
346     if (!Index) {
347       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
348       return false;
349     }
350 
351     GEPVectorIdx[GEP] = Index;
352     for (User *GEPUser : AllocaUser->users()) {
353       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
354         return false;
355 
356       WorkList.push_back(GEPUser);
357     }
358   }
359 
360   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
361 
362   DEBUG(dbgs() << "  Converting alloca to vector "
363         << *AllocaTy << " -> " << *VectorTy << '\n');
364 
365   for (Value *V : WorkList) {
366     Instruction *Inst = cast<Instruction>(V);
367     IRBuilder<> Builder(Inst);
368     switch (Inst->getOpcode()) {
369     case Instruction::Load: {
370       Value *Ptr = Inst->getOperand(0);
371       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
372       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
373       Value *VecValue = Builder.CreateLoad(BitCast);
374       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
375       Inst->replaceAllUsesWith(ExtractElement);
376       Inst->eraseFromParent();
377       break;
378     }
379     case Instruction::Store: {
380       Value *Ptr = Inst->getOperand(1);
381       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
382       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
383       Value *VecValue = Builder.CreateLoad(BitCast);
384       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
385                                                        Inst->getOperand(0),
386                                                        Index);
387       Builder.CreateStore(NewVecValue, BitCast);
388       Inst->eraseFromParent();
389       break;
390     }
391     case Instruction::BitCast:
392     case Instruction::AddrSpaceCast:
393       break;
394 
395     default:
396       Inst->dump();
397       llvm_unreachable("Inconsistency in instructions promotable to vector");
398     }
399   }
400   return true;
401 }
402 
403 static bool isCallPromotable(CallInst *CI) {
404   // TODO: We might be able to handle some cases where the callee is a
405   // constantexpr bitcast of a function.
406   if (!CI->getCalledFunction())
407     return false;
408 
409   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
410   if (!II)
411     return false;
412 
413   switch (II->getIntrinsicID()) {
414   case Intrinsic::memcpy:
415   case Intrinsic::memmove:
416   case Intrinsic::memset:
417   case Intrinsic::lifetime_start:
418   case Intrinsic::lifetime_end:
419   case Intrinsic::invariant_start:
420   case Intrinsic::invariant_end:
421   case Intrinsic::invariant_group_barrier:
422   case Intrinsic::objectsize:
423     return true;
424   default:
425     return false;
426   }
427 }
428 
429 static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
430   for (User *User : Val->users()) {
431     if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
432       continue;
433 
434     if (CallInst *CI = dyn_cast<CallInst>(User)) {
435       if (!isCallPromotable(CI))
436         return false;
437 
438       WorkList.push_back(User);
439       continue;
440     }
441 
442     Instruction *UseInst = dyn_cast<Instruction>(User);
443     if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
444       return false;
445 
446     if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
447       if (SI->isVolatile())
448         return false;
449 
450       // Reject if the stored value is not the pointer operand.
451       if (SI->getPointerOperand() != Val)
452         return false;
453     } else if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
454       if (LI->isVolatile())
455         return false;
456     } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
457       if (RMW->isVolatile())
458         return false;
459     } else if (AtomicCmpXchgInst *CAS
460                = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
461       if (CAS->isVolatile())
462         return false;
463     }
464 
465     if (!User->getType()->isPointerTy())
466       continue;
467 
468     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
469       // Be conservative if an address could be computed outside the bounds of
470       // the alloca.
471       if (!GEP->isInBounds())
472         return false;
473     }
474 
475     WorkList.push_back(User);
476     if (!collectUsesWithPtrTypes(User, WorkList))
477       return false;
478   }
479 
480   return true;
481 }
482 
483 void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
484   if (!I.isStaticAlloca())
485     return;
486 
487   IRBuilder<> Builder(&I);
488 
489   // First try to replace the alloca with a vector
490   Type *AllocaTy = I.getAllocatedType();
491 
492   DEBUG(dbgs() << "Trying to promote " << I << '\n');
493 
494   if (tryPromoteAllocaToVector(&I))
495     return;
496 
497   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
498 
499   const Function &ContainingFunction = *I.getParent()->getParent();
500 
501   // FIXME: We should also try to get this value from the reqd_work_group_size
502   // function attribute if it is available.
503   unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
504 
505   int AllocaSize =
506       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
507 
508   if (AllocaSize > LocalMemAvailable) {
509     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
510     return;
511   }
512 
513   std::vector<Value*> WorkList;
514 
515   if (!collectUsesWithPtrTypes(&I, WorkList)) {
516     DEBUG(dbgs() << " Do not know how to convert all uses\n");
517     return;
518   }
519 
520   DEBUG(dbgs() << "Promoting alloca to local memory\n");
521   LocalMemAvailable -= AllocaSize;
522 
523   Function *F = I.getParent()->getParent();
524 
525   Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
526   GlobalVariable *GV = new GlobalVariable(
527       *Mod, GVTy, false, GlobalValue::InternalLinkage,
528       UndefValue::get(GVTy),
529       Twine(F->getName()) + Twine('.') + I.getName(),
530       nullptr,
531       GlobalVariable::NotThreadLocal,
532       AMDGPUAS::LOCAL_ADDRESS);
533   GV->setUnnamedAddr(true);
534   GV->setAlignment(I.getAlignment());
535 
536   Value *TCntY, *TCntZ;
537 
538   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
539   Value *TIdX = getWorkitemID(Builder, 0);
540   Value *TIdY = getWorkitemID(Builder, 1);
541   Value *TIdZ = getWorkitemID(Builder, 2);
542 
543   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
544   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
545   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
546   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
547   TID = Builder.CreateAdd(TID, TIdZ);
548 
549   Value *Indices[] = {
550     Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
551     TID
552   };
553 
554   Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
555   I.mutateType(Offset->getType());
556   I.replaceAllUsesWith(Offset);
557   I.eraseFromParent();
558 
559   for (Value *V : WorkList) {
560     CallInst *Call = dyn_cast<CallInst>(V);
561     if (!Call) {
562       Type *EltTy = V->getType()->getPointerElementType();
563       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
564 
565       // The operand's value should be corrected on its own.
566       if (isa<AddrSpaceCastInst>(V))
567         continue;
568 
569       // FIXME: It doesn't really make sense to try to do this for all
570       // instructions.
571       V->mutateType(NewTy);
572       continue;
573     }
574 
575     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
576     if (!Intr) {
577       // FIXME: What is this for? It doesn't make sense to promote arbitrary
578       // function calls. If the call is to a defined function that can also be
579       // promoted, we should be able to do this once that function is also
580       // rewritten.
581 
582       std::vector<Type*> ArgTypes;
583       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
584                                 ArgIdx != ArgEnd; ++ArgIdx) {
585         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
586       }
587       Function *F = Call->getCalledFunction();
588       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
589                                                 F->isVarArg());
590       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
591                                              NewType, F->getAttributes());
592       Function *NewF = cast<Function>(C);
593       Call->setCalledFunction(NewF);
594       continue;
595     }
596 
597     Builder.SetInsertPoint(Intr);
598     switch (Intr->getIntrinsicID()) {
599     case Intrinsic::lifetime_start:
600     case Intrinsic::lifetime_end:
601       // These intrinsics are for address space 0 only
602       Intr->eraseFromParent();
603       continue;
604     case Intrinsic::memcpy: {
605       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
606       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
607                            MemCpy->getLength(), MemCpy->getAlignment(),
608                            MemCpy->isVolatile());
609       Intr->eraseFromParent();
610       continue;
611     }
612     case Intrinsic::memmove: {
613       MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
614       Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
615                             MemMove->getLength(), MemMove->getAlignment(),
616                             MemMove->isVolatile());
617       Intr->eraseFromParent();
618       continue;
619     }
620     case Intrinsic::memset: {
621       MemSetInst *MemSet = cast<MemSetInst>(Intr);
622       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
623                            MemSet->getLength(), MemSet->getAlignment(),
624                            MemSet->isVolatile());
625       Intr->eraseFromParent();
626       continue;
627     }
628     case Intrinsic::invariant_start:
629     case Intrinsic::invariant_end:
630     case Intrinsic::invariant_group_barrier:
631       Intr->eraseFromParent();
632       // FIXME: I think the invariant marker should still theoretically apply,
633       // but the intrinsics need to be changed to accept pointers with any
634       // address space.
635       continue;
636     case Intrinsic::objectsize: {
637       Value *Src = Intr->getOperand(0);
638       Type *SrcTy = Src->getType()->getPointerElementType();
639       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
640         Intrinsic::objectsize,
641         { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
642       );
643 
644       CallInst *NewCall
645         = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
646       Intr->replaceAllUsesWith(NewCall);
647       Intr->eraseFromParent();
648       continue;
649     }
650     default:
651       Intr->dump();
652       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
653     }
654   }
655 }
656 
657 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
658   return new AMDGPUPromoteAlloca(TM);
659 }
660