1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This pass eliminates allocas by either converting them into vectors or
11 // by migrating them to local address space.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/IR/IRBuilder.h"
19 #include "llvm/IR/InstVisitor.h"
20 #include "llvm/IR/MDBuilder.h"
21 #include "llvm/Support/Debug.h"
22 #include "llvm/Support/raw_ostream.h"
23 
24 #define DEBUG_TYPE "amdgpu-promote-alloca"
25 
26 using namespace llvm;
27 
28 namespace {
29 
30 // FIXME: This can create globals so should be a module pass.
31 class AMDGPUPromoteAlloca : public FunctionPass,
32                             public InstVisitor<AMDGPUPromoteAlloca> {
33 private:
34   const TargetMachine *TM;
35   Module *Mod;
36   MDNode *MaxWorkGroupSizeRange;
37 
38   // FIXME: This should be per-kernel.
39   int LocalMemAvailable;
40 
41   bool IsAMDGCN;
42   bool IsAMDHSA;
43 
44   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
45   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
46 
47 public:
48   static char ID;
49 
50   AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
51     FunctionPass(ID),
52     TM(TM_),
53     Mod(nullptr),
54     MaxWorkGroupSizeRange(nullptr),
55     LocalMemAvailable(0),
56     IsAMDGCN(false),
57     IsAMDHSA(false) { }
58 
59   bool doInitialization(Module &M) override;
60   bool runOnFunction(Function &F) override;
61 
62   const char *getPassName() const override {
63     return "AMDGPU Promote Alloca";
64   }
65 
66   void visitAlloca(AllocaInst &I);
67 };
68 
69 } // End anonymous namespace
70 
71 char AMDGPUPromoteAlloca::ID = 0;
72 
73 INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
74                    "AMDGPU promote alloca to vector or LDS", false, false)
75 
76 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
77 
78 
79 bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
80   if (!TM)
81     return false;
82 
83   Mod = &M;
84 
85   // The maximum workitem id.
86   //
87   // FIXME: Should get as subtarget property. Usually runtime enforced max is
88   // 256.
89   MDBuilder MDB(Mod->getContext());
90   MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
91 
92   const Triple &TT = TM->getTargetTriple();
93 
94   IsAMDGCN = TT.getArch() == Triple::amdgcn;
95   IsAMDHSA = TT.getOS() == Triple::AMDHSA;
96 
97   return false;
98 }
99 
100 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
101   if (!TM)
102     return false;
103 
104   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
105 
106   FunctionType *FTy = F.getFunctionType();
107   LocalMemAvailable = ST.getLocalMemorySize();
108 
109 
110   // If the function has any arguments in the local address space, then it's
111   // possible these arguments require the entire local memory space, so
112   // we cannot use local memory in the pass.
113   for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
114     Type *ParamTy = FTy->getParamType(i);
115     if (ParamTy->isPointerTy() &&
116         ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
117       LocalMemAvailable = 0;
118       DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
119                       "local memory disabled.\n");
120       break;
121     }
122   }
123 
124   if (LocalMemAvailable > 0) {
125     // Check how much local memory is being used by global objects
126     for (Module::global_iterator I = Mod->global_begin(),
127                                  E = Mod->global_end(); I != E; ++I) {
128       GlobalVariable *GV = &*I;
129       if (GV->getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
130         continue;
131       for (Value::use_iterator U = GV->use_begin(),
132                                UE = GV->use_end(); U != UE; ++U) {
133         Instruction *Use = dyn_cast<Instruction>(*U);
134         if (!Use)
135           continue;
136         if (Use->getParent()->getParent() == &F)
137           LocalMemAvailable -=
138               Mod->getDataLayout().getTypeAllocSize(GV->getValueType());
139       }
140     }
141   }
142 
143   LocalMemAvailable = std::max(0, LocalMemAvailable);
144   DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");
145 
146   visit(F);
147 
148   return false;
149 }
150 
151 std::pair<Value *, Value *>
152 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
153   if (!IsAMDHSA) {
154     Function *LocalSizeYFn
155       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
156     Function *LocalSizeZFn
157       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
158 
159     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
160     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
161 
162     LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
163     LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
164 
165     return std::make_pair(LocalSizeY, LocalSizeZ);
166   }
167 
168   // We must read the size out of the dispatch pointer.
169   assert(IsAMDGCN);
170 
171   // We are indexing into this struct, and want to extract the workgroup_size_*
172   // fields.
173   //
174   //   typedef struct hsa_kernel_dispatch_packet_s {
175   //     uint16_t header;
176   //     uint16_t setup;
177   //     uint16_t workgroup_size_x ;
178   //     uint16_t workgroup_size_y;
179   //     uint16_t workgroup_size_z;
180   //     uint16_t reserved0;
181   //     uint32_t grid_size_x ;
182   //     uint32_t grid_size_y ;
183   //     uint32_t grid_size_z;
184   //
185   //     uint32_t private_segment_size;
186   //     uint32_t group_segment_size;
187   //     uint64_t kernel_object;
188   //
189   // #ifdef HSA_LARGE_MODEL
190   //     void *kernarg_address;
191   // #elif defined HSA_LITTLE_ENDIAN
192   //     void *kernarg_address;
193   //     uint32_t reserved1;
194   // #else
195   //     uint32_t reserved1;
196   //     void *kernarg_address;
197   // #endif
198   //     uint64_t reserved2;
199   //     hsa_signal_t completion_signal; // uint64_t wrapper
200   //   } hsa_kernel_dispatch_packet_t
201   //
202   Function *DispatchPtrFn
203     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
204 
205   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
206   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
207   DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
208 
209   // Size of the dispatch packet struct.
210   DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
211 
212   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
213   Value *CastDispatchPtr = Builder.CreateBitCast(
214     DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
215 
216   // We could do a single 64-bit load here, but it's likely that the basic
217   // 32-bit and extract sequence is already present, and it is probably easier
218   // to CSE this. The loads should be mergable later anyway.
219   Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
220   LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);
221 
222   Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
223   LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);
224 
225   MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
226   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
227   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
228   LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
229 
230   // Extract y component. Upper half of LoadZU should be zero already.
231   Value *Y = Builder.CreateLShr(LoadXY, 16);
232 
233   return std::make_pair(Y, LoadZU);
234 }
235 
236 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
237   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
238 
239   switch (N) {
240   case 0:
241     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
242       : Intrinsic::r600_read_tidig_x;
243     break;
244   case 1:
245     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
246       : Intrinsic::r600_read_tidig_y;
247     break;
248 
249   case 2:
250     IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
251       : Intrinsic::r600_read_tidig_z;
252     break;
253   default:
254     llvm_unreachable("invalid dimension");
255   }
256 
257   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
258   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
259   CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
260 
261   return CI;
262 }
263 
264 static VectorType *arrayTypeToVecType(Type *ArrayTy) {
265   return VectorType::get(ArrayTy->getArrayElementType(),
266                          ArrayTy->getArrayNumElements());
267 }
268 
269 static Value *
270 calculateVectorIndex(Value *Ptr,
271                      const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
272   if (isa<AllocaInst>(Ptr))
273     return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
274 
275   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
276 
277   auto I = GEPIdx.find(GEP);
278   return I == GEPIdx.end() ? nullptr : I->second;
279 }
280 
281 static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
282   // FIXME we only support simple cases
283   if (GEP->getNumOperands() != 3)
284     return NULL;
285 
286   ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
287   if (!I0 || !I0->isZero())
288     return NULL;
289 
290   return GEP->getOperand(2);
291 }
292 
293 // Not an instruction handled below to turn into a vector.
294 //
295 // TODO: Check isTriviallyVectorizable for calls and handle other
296 // instructions.
297 static bool canVectorizeInst(Instruction *Inst, User *User) {
298   switch (Inst->getOpcode()) {
299   case Instruction::Load:
300   case Instruction::BitCast:
301   case Instruction::AddrSpaceCast:
302     return true;
303   case Instruction::Store: {
304     // Must be the stored pointer operand, not a stored value.
305     StoreInst *SI = cast<StoreInst>(Inst);
306     return SI->getPointerOperand() == User;
307   }
308   default:
309     return false;
310   }
311 }
312 
313 static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
314   Type *AllocaTy = Alloca->getAllocatedType();
315 
316   DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
317 
318   // FIXME: There is no reason why we can't support larger arrays, we
319   // are just being conservative for now.
320   if (!AllocaTy->isArrayTy() ||
321       AllocaTy->getArrayElementType()->isVectorTy() ||
322       AllocaTy->getArrayNumElements() > 4) {
323 
324     DEBUG(dbgs() << "  Cannot convert type to vector");
325     return false;
326   }
327 
328   std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
329   std::vector<Value*> WorkList;
330   for (User *AllocaUser : Alloca->users()) {
331     GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
332     if (!GEP) {
333       if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
334         return false;
335 
336       WorkList.push_back(AllocaUser);
337       continue;
338     }
339 
340     Value *Index = GEPToVectorIndex(GEP);
341 
342     // If we can't compute a vector index from this GEP, then we can't
343     // promote this alloca to vector.
344     if (!Index) {
345       DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
346       return false;
347     }
348 
349     GEPVectorIdx[GEP] = Index;
350     for (User *GEPUser : AllocaUser->users()) {
351       if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
352         return false;
353 
354       WorkList.push_back(GEPUser);
355     }
356   }
357 
358   VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
359 
360   DEBUG(dbgs() << "  Converting alloca to vector "
361         << *AllocaTy << " -> " << *VectorTy << '\n');
362 
363   for (std::vector<Value*>::iterator I = WorkList.begin(),
364                                      E = WorkList.end(); I != E; ++I) {
365     Instruction *Inst = cast<Instruction>(*I);
366     IRBuilder<> Builder(Inst);
367     switch (Inst->getOpcode()) {
368     case Instruction::Load: {
369       Value *Ptr = Inst->getOperand(0);
370       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
371       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
372       Value *VecValue = Builder.CreateLoad(BitCast);
373       Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
374       Inst->replaceAllUsesWith(ExtractElement);
375       Inst->eraseFromParent();
376       break;
377     }
378     case Instruction::Store: {
379       Value *Ptr = Inst->getOperand(1);
380       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
381       Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
382       Value *VecValue = Builder.CreateLoad(BitCast);
383       Value *NewVecValue = Builder.CreateInsertElement(VecValue,
384                                                        Inst->getOperand(0),
385                                                        Index);
386       Builder.CreateStore(NewVecValue, BitCast);
387       Inst->eraseFromParent();
388       break;
389     }
390     case Instruction::BitCast:
391     case Instruction::AddrSpaceCast:
392       break;
393 
394     default:
395       Inst->dump();
396       llvm_unreachable("Inconsistency in instructions promotable to vector");
397     }
398   }
399   return true;
400 }
401 
402 static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
403   bool Success = true;
404   for (User *User : Val->users()) {
405     if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
406       continue;
407     if (CallInst *CI = dyn_cast<CallInst>(User)) {
408       // TODO: We might be able to handle some cases where the callee is a
409       // constantexpr bitcast of a function.
410       if (!CI->getCalledFunction())
411         return false;
412 
413       WorkList.push_back(User);
414       continue;
415     }
416 
417     // FIXME: Correctly handle ptrtoint instructions.
418     Instruction *UseInst = dyn_cast<Instruction>(User);
419     if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
420       return false;
421 
422     if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
423       // Reject if the stored value is not the pointer operand.
424       if (SI->getPointerOperand() != Val)
425         return false;
426     }
427 
428     if (!User->getType()->isPointerTy())
429       continue;
430 
431     WorkList.push_back(User);
432 
433     Success &= collectUsesWithPtrTypes(User, WorkList);
434   }
435   return Success;
436 }
437 
438 void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
439   if (!I.isStaticAlloca())
440     return;
441 
442   IRBuilder<> Builder(&I);
443 
444   // First try to replace the alloca with a vector
445   Type *AllocaTy = I.getAllocatedType();
446 
447   DEBUG(dbgs() << "Trying to promote " << I << '\n');
448 
449   if (tryPromoteAllocaToVector(&I))
450     return;
451 
452   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
453 
454   // FIXME: This is the maximum work group size.  We should try to get
455   // value from the reqd_work_group_size function attribute if it is
456   // available.
457   unsigned WorkGroupSize = 256;
458   int AllocaSize =
459       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
460 
461   if (AllocaSize > LocalMemAvailable) {
462     DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
463     return;
464   }
465 
466   std::vector<Value*> WorkList;
467 
468   if (!collectUsesWithPtrTypes(&I, WorkList)) {
469     DEBUG(dbgs() << " Do not know how to convert all uses\n");
470     return;
471   }
472 
473   DEBUG(dbgs() << "Promoting alloca to local memory\n");
474   LocalMemAvailable -= AllocaSize;
475 
476   Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
477   GlobalVariable *GV = new GlobalVariable(
478       *Mod, GVTy, false, GlobalValue::ExternalLinkage, 0, I.getName(), 0,
479       GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
480 
481   Value *TCntY, *TCntZ;
482 
483   std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
484   Value *TIdX = getWorkitemID(Builder, 0);
485   Value *TIdY = getWorkitemID(Builder, 1);
486   Value *TIdZ = getWorkitemID(Builder, 2);
487 
488   Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
489   Tmp0 = Builder.CreateMul(Tmp0, TIdX);
490   Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
491   Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
492   TID = Builder.CreateAdd(TID, TIdZ);
493 
494   std::vector<Value*> Indices;
495   Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
496   Indices.push_back(TID);
497 
498   Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
499   I.mutateType(Offset->getType());
500   I.replaceAllUsesWith(Offset);
501   I.eraseFromParent();
502 
503   for (std::vector<Value*>::iterator i = WorkList.begin(),
504                                      e = WorkList.end(); i != e; ++i) {
505     Value *V = *i;
506     CallInst *Call = dyn_cast<CallInst>(V);
507     if (!Call) {
508       Type *EltTy = V->getType()->getPointerElementType();
509       PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
510 
511       // The operand's value should be corrected on its own.
512       if (isa<AddrSpaceCastInst>(V))
513         continue;
514 
515       // FIXME: It doesn't really make sense to try to do this for all
516       // instructions.
517       V->mutateType(NewTy);
518       continue;
519     }
520 
521     IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
522     if (!Intr) {
523       std::vector<Type*> ArgTypes;
524       for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
525                                 ArgIdx != ArgEnd; ++ArgIdx) {
526         ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
527       }
528       Function *F = Call->getCalledFunction();
529       FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
530                                                 F->isVarArg());
531       Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
532                                              NewType, F->getAttributes());
533       Function *NewF = cast<Function>(C);
534       Call->setCalledFunction(NewF);
535       continue;
536     }
537 
538     Builder.SetInsertPoint(Intr);
539     switch (Intr->getIntrinsicID()) {
540     case Intrinsic::lifetime_start:
541     case Intrinsic::lifetime_end:
542       // These intrinsics are for address space 0 only
543       Intr->eraseFromParent();
544       continue;
545     case Intrinsic::memcpy: {
546       MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
547       Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
548                            MemCpy->getLength(), MemCpy->getAlignment(),
549                            MemCpy->isVolatile());
550       Intr->eraseFromParent();
551       continue;
552     }
553     case Intrinsic::memset: {
554       MemSetInst *MemSet = cast<MemSetInst>(Intr);
555       Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
556                            MemSet->getLength(), MemSet->getAlignment(),
557                            MemSet->isVolatile());
558       Intr->eraseFromParent();
559       continue;
560     }
561     case Intrinsic::invariant_start:
562     case Intrinsic::invariant_end:
563     case Intrinsic::invariant_group_barrier:
564       Intr->eraseFromParent();
565       // FIXME: I think the invariant marker should still theoretically apply,
566       // but the intrinsics need to be changed to accept pointers with any
567       // address space.
568       continue;
569     default:
570       Intr->dump();
571       llvm_unreachable("Don't know how to promote alloca intrinsic use.");
572     }
573   }
574 }
575 
576 FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
577   return new AMDGPUPromoteAlloca(TM);
578 }
579