//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  void runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  AMDGPUAS AS;

  const TargetLowering *TLI;

  void visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
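
  /// Check whether the memory access uses an indirectly loaded address, i.e.
  /// the pointer of a global access is itself computed from a value loaded
  /// from global, local, or constant memory.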
  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride memory access.
  /// The purpose is to identify a memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    if (auto Phi = dyn_cast<PHINode>(V)) {
      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
        WorkSet.insert(Phi->getIncomingValue(I));
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

void AMDGPUPerfHint::visit(const Function &F) {
  auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
  if (!FIP.second)
    return;

  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
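        // A memory instruction counts toward both the memory instruction
        // count and the total instruction count used by the heuristics below.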
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      CallSite CS(const_cast<Instruction *>(&I));
      if (CS) {
        Function *Callee = CS.getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        visit(*Callee);
        auto Loc = FIM.find(Callee);

        assert(Loc != FIM.end() && "No func info");
        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }
}

void AMDGPUPerfHint::runOnFunction(Function &F) {
  if (FIM.find(&F) != FIM.end())
    return;

  const Module &M = *F.getParent();
  DL = &M.getDataLayout();
  AS = AMDGPU::getAMDGPUAS(M);

  visit(F);
  auto Loc = FIM.find(&F);

  assert(Loc != FIM.end() && "No func info");
  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
                    << '\n'
                    << " IAMInst: " << Loc->second.IAMInstCount << '\n'
                    << " LSMInst: " << Loc->second.LSMInstCount << '\n'
                    << " TotalInst: " << Loc->second.InstCount << '\n');

  auto &FI = Loc->second;

  if (isMemBound(FI)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
  }
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
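    // A flat pointer may be dynamically resolved to any address space, so it
    // is conservatively treated as a global access here.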
    return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);

  AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
  Analyzer.runOnFunction(F);
  return false;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}