//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to the local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
  const TargetMachine *TM;
  Module *Mod;
  MDNode *MaxWorkGroupSizeRange;

  // FIXME: This should be per-kernel.
  int LocalMemAvailable;

  bool IsAMDGCN;
  bool IsAMDHSA;

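  // Helpers for building the workgroup size and workitem id values used to
  // give each workitem its own region of the promoted LDS array.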
  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

public:
  static char ID;

  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
    FunctionPass(ID),
    TM(TM_),
    Mod(nullptr),
    MaxWorkGroupSizeRange(nullptr),
    LocalMemAvailable(0),
    IsAMDGCN(false),
    IsAMDHSA(false) { }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU Promote Alloca";
  }

  void handleAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  if (!TM)
    return false;

  Mod = &M;

  // The maximum workitem id.
  //
  // FIXME: Should get this as a subtarget property. Usually the runtime
  // enforced max is 256.
  MDBuilder MDB(Mod->getContext());
  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));

  const Triple &TT = TM->getTargetTriple();

  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM || skipFunction(F))
    return false;

  FunctionType *FTy = F.getFunctionType();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promotion to "
                      "local memory disabled.\n");
      return false;
    }
  }

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  LocalMemAvailable = ST.getLocalMemorySize();
  if (LocalMemAvailable == 0)
    return false;

  // Check how much local memory is already being used by global objects.
  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    for (User *U : GV.users()) {
      Instruction *Use = dyn_cast<Instruction>(U);
      if (!Use)
        continue;

      if (Use->getParent()->getParent() == &F) {
        LocalMemAvailable -=
          Mod->getDataLayout().getTypeAllocSize(GV.getValueType());
        break;
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  BasicBlock &EntryBB = *F.begin();
  for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) {
    AllocaInst *AI = dyn_cast<AllocaInst>(I);

    ++I;
    if (AI)
      handleAlloca(*AI);
  }

  return true;
}

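// Returns the workgroup size in the Y and Z dimensions. On non-HSA targets
// these come from the r600 local size intrinsics; on amdhsa they are read
// from the kernel dispatch packet.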
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit load and extract sequence is already present, and it is probably
  // easier to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);

  MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  // Extract the y component. The upper half of LoadZU should already be zero.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

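// Returns a call to the workitem id intrinsic for dimension N (0 = x, 1 = y,
// 2 = z), annotated with the maximum workgroup size range metadata.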
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
      : Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
      : Intrinsic::r600_read_tidig_y;
    break;
  case 2:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
      : Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  return CI;
}

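// Builds a vector type with the same element type and element count as the
// given array type.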
static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

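// Returns the vector element index for a pointer into the alloca: index 0 for
// the alloca itself, the precomputed index for a known GEP, or null if the
// GEP was not analyzed.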
static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

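// Extracts the element index from a GEP of the form
// getelementptr [N x T], ptr, 0, Idx; returns null for anything else.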
static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases.
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}

// Returns true if Inst is an instruction we know how to rewrite when the
// alloca is turned into a vector. User is the pointer operand (the alloca or
// a GEP into it) through which Inst was reached.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value.
    StoreInst *SI = cast<StoreInst>(Inst);
    return SI->getPointerOperand() == User;
  }
  default:
    return false;
  }
}

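// Attempts to rewrite a small array alloca and all of its users as loads,
// extracts and inserts on a single vector value. Returns true on success.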
static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays; we
  // are just being conservative for now.
  if (!AllocaTy ||
      AllocaTy->getElementType()->isVectorTy() ||
      AllocaTy->getNumElements() > 4) {
    DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to a vector.
    if (!Index) {
      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << "  Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

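// Returns true if the call is an intrinsic we know how to rewrite when its
// pointer argument is moved to the local address space.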
static bool isCallPromotable(CallInst *CI) {
  // TODO: We might be able to handle some cases where the callee is a
  // constantexpr bitcast of a function.
  if (!CI->getCalledFunction())
    return false;

  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::invariant_group_barrier:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

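// Walks the users of Val, collecting promotable calls and pointer-typed users
// into WorkList. Returns false if any use would block promotion (ptrtoint,
// volatile accesses, stores that escape the pointer, non-inbounds GEPs, or
// calls we cannot rewrite).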
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  for (User *User : Val->users()) {
    if (std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = dyn_cast<Instruction>(User);
    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the pointer is being stored as a value; it must only be
      // used as the store's address operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (LoadInst *LI = dyn_cast_or_null<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast_or_null<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS
               = dyn_cast_or_null<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(User, WorkList))
      return false;
  }

  return true;
}

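// Tries to eliminate a single entry-block alloca: first by promoting it to a
// vector, and failing that by moving it into a per-workgroup LDS array indexed
// by the linearized workitem id.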
void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector.
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  const Function &ContainingFunction = *I.getParent()->getParent();

  // FIXME: We should also try to get this value from the reqd_work_group_size
  // function attribute if it is available.
  unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);

  int AllocaSize =
      WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);

  if (AllocaSize > LocalMemAvailable) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, WorkList)) {
    DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  LocalMemAvailable -= AllocaSize;

  Function *F = I.getParent()->getParent();

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(true);
  GV->setAlignment(I.getAlignment());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // The operand's value should be corrected on its own.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // FIXME: What is this for? It doesn't make sense to promote arbitrary
      // function calls. If the call is to a defined function that can also be
      // promoted, we should be able to do this once that function is also
      // rewritten.

      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
                                ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
                                             NewType, F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memmove: {
      MemMoveInst *MemMove = cast<MemMoveInst>(Intr);
      Builder.CreateMemMove(MemMove->getRawDest(), MemMove->getRawSource(),
                            MemMove->getLength(), MemMove->getAlignment(),
                            MemMove->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::invariant_group_barrier:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(Mod,
        Intrinsic::objectsize,
        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
      );

      CallInst *NewCall
        = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

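// Entry point used by the AMDGPU target machine to create this pass.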
FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
  return new AMDGPUPromoteAlloca(TM);
}