//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to the local address space.
//
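// As a rough, illustrative sketch (the names and types below are invented,
// not produced verbatim by the pass): a small, statically sized
//
//   %tmp = alloca [4 x i32]
//
// whose uses the pass understands either becomes loads and stores of a
// <4 x i32> value, with GEP indices turned into extract/insertelement lanes,
// or is replaced by an LDS (addrspace(3)) array with one [4 x i32] slot per
// workitem, indexed by the linearized workitem id.
//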
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass,
                            public InstVisitor<AMDGPUPromoteAlloca> {
private:
  const TargetMachine *TM;
  Module *Mod;
  MDNode *MaxWorkGroupSizeRange;

  // FIXME: This should be per-kernel.
  int LocalMemAvailable;

  bool IsAMDGCN;
  bool IsAMDHSA;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

public:
  static char ID;

  AMDGPUPromoteAlloca(const TargetMachine *TM_ = nullptr) :
    FunctionPass(ID),
    TM(TM_),
    Mod(nullptr),
    MaxWorkGroupSizeRange(nullptr),
    LocalMemAvailable(0),
    IsAMDGCN(false),
    IsAMDHSA(false) { }

  bool doInitialization(Module &M) override;
  bool runOnFunction(Function &F) override;

  const char *getPassName() const override {
    return "AMDGPU Promote Alloca";
  }

  void visitAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_TM_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
                   "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  if (!TM)
    return false;

  Mod = &M;

  // The maximum workitem id.
  //
  // FIXME: Should be obtained from the subtarget. The runtime-enforced
  // maximum is usually 256.
  MDBuilder MDB(Mod->getContext());
  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));

  const Triple &TT = TM->getTargetTriple();

  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  return false;
}

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (!TM)
    return false;

  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);

  FunctionType *FTy = F.getFunctionType();
  LocalMemAvailable = ST.getLocalMemorySize();

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in this pass.
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument. Promotion to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    // Check how much local memory is being used by global objects.
    for (Module::global_iterator I = Mod->global_begin(),
                                 E = Mod->global_end(); I != E; ++I) {
      GlobalVariable *GV = &*I;
      if (GV->getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;
      for (Value::use_iterator U = GV->use_begin(),
                               UE = GV->use_end(); U != UE; ++U) {
        Instruction *Use = dyn_cast<Instruction>(*U);
        if (!Use)
          continue;
        if (Use->getParent()->getParent() == &F)
          LocalMemAvailable -=
            Mod->getDataLayout().getTypeAllocSize(GV->getValueType());
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  return false;
}

std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t;
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
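  // Dissecting the packet as an array of i32 (per the struct layout above):
  // dword 1, at byte offset 4, packs workgroup_size_x in its low 16 bits and
  // workgroup_size_y in its high 16 bits; dword 2, at byte offset 8, packs
  // workgroup_size_z in its low 16 bits and reserved0 (assumed to be zero)
  // in its high 16 bits.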
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(GEPXY, 4);

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(GEPZU, 4);

  MDNode *MD = llvm::MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
  Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_x
                      : Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_y
                      : Intrinsic::r600_read_tidig_y;
    break;
  case 2:
    IntrID = IsAMDGCN ? Intrinsic::amdgcn_workitem_id_z
                      : Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);

  return CI;
}

static VectorType *arrayTypeToVecType(Type *ArrayTy) {
  return VectorType::get(ArrayTy->getArrayElementType(),
                         ArrayTy->getArrayNumElements());
}

static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases for now.
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}

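// As an example of what GEPToVectorIndex above accepts (the names here are
// illustrative only): a GEP of the form
//
//   getelementptr [4 x i32], [4 x i32]* %alloca, i32 0, i32 %idx
//
// has exactly three operands and a constant-zero leading index, so %idx is
// returned and later becomes the extract/insertelement lane.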

// Checks whether an instruction is one this pass knows how to rewrite when
// the alloca is promoted to a vector.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User) {
  switch (Inst->getOpcode()) {
  case Instruction::Load:
  case Instruction::BitCast:
  case Instruction::AddrSpaceCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value.
    StoreInst *SI = cast<StoreInst>(Inst);
    return SI->getPointerOperand() == User;
  }
  default:
    return false;
  }
}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  Type *AllocaTy = Alloca->getAllocatedType();

  DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays; we are just
  // being conservative for now.
  if (!AllocaTy->isArrayTy() ||
      AllocaTy->getArrayElementType()->isVectorTy() ||
      AllocaTy->getArrayNumElements() > 4) {
    DEBUG(dbgs() << " Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to a vector.
    if (!Index) {
      DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << " Converting alloca to vector "
               << *AllocaTy << " -> " << *VectorTy << '\n');

  for (std::vector<Value*>::iterator I = WorkList.begin(),
                                     E = WorkList.end(); I != E; ++I) {
    Instruction *Inst = cast<Instruction>(*I);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

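// Collects, transitively, the users of Val that will need to be rewritten if
// Val is moved into the local address space: pointer-typed users plus calls
// to known functions. Returns false on any use the pass does not know how to
// handle, e.g. a ptrtoint, a call without a known callee, or a store of the
// pointer itself (which would let it escape).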
static bool collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  bool Success = true;
  for (User *UseVal : Val->users()) {
    if (std::find(WorkList.begin(), WorkList.end(), UseVal) != WorkList.end())
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(UseVal)) {
      // TODO: We might be able to handle some cases where the callee is a
      // constantexpr bitcast of a function.
      if (!CI->getCalledFunction())
        return false;

      WorkList.push_back(UseVal);
      continue;
    }

    // FIXME: Correctly handle ptrtoint instructions.
    Instruction *UseInst = dyn_cast<Instruction>(UseVal);
    if (UseInst && UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (StoreInst *SI = dyn_cast_or_null<StoreInst>(UseInst)) {
      // Reject if the pointer is being stored as a value rather than used as
      // the store's pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    }

    if (!UseVal->getType()->isPointerTy())
      continue;

    WorkList.push_back(UseVal);

    Success &= collectUsesWithPtrTypes(UseVal, WorkList);
  }
  return Success;
}

void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  if (!I.isStaticAlloca())
    return;

  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector.
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // FIXME: This is the maximum work group size. We should try to get the
  // value from the reqd_work_group_size function attribute if it is
  // available.
  unsigned WorkGroupSize = 256;
  int AllocaSize =
    WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);

  if (AllocaSize > LocalMemAvailable) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  std::vector<Value*> WorkList;

  if (!collectUsesWithPtrTypes(&I, WorkList)) {
    DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  LocalMemAvailable -= AllocaSize;

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
    *Mod, GVTy, false, GlobalValue::ExternalLinkage, nullptr, I.getName(),
    nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

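  // Linearize the workitem id so that each workitem addresses its own slot of
  // the LDS array: TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ, which
  // is unique per workitem and smaller than the flat workgroup size (assumed
  // to be at most WorkGroupSize).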
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  Value *Offset = Builder.CreateGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  for (std::vector<Value*>::iterator i = WorkList.begin(),
                                     e = WorkList.end(); i != e; ++i) {
    Value *V = *i;
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // An addrspacecast keeps its result type; its pointer operand is
      // corrected on its own.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
           ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction((F->getName() + ".local").str(),
                                             NewType, F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::invariant_group_barrier:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

FunctionPass *llvm::createAMDGPUPromoteAlloca(const TargetMachine *TM) {
  return new AMDGPUPromoteAlloca(TM);
}