//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Printable.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  void runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  void visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
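
  /// Check if a function needs the "limit wave" hint. The scoring in
  /// needLimitWave below weights indirect-access and large-stride memory
  /// instructions by IAWeight and LSWeight (1000 by default) on top of the
  /// plain memory instruction count; see the file header for the
  /// cache-thrashing rationale.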
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large stride access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large stride memory accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}
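
// An access is treated as indirect when the pointer it uses is itself
// produced by a load from global, local or constant memory. Illustrative
// example (not from the source):
//   int *p = table[i]; // pointer value loaded from memory
//   x = *p;            // indirect access
// The worklist below chases the pointer operand through GEPs, casts,
// binary operators, selects and extractelement until it finds such a load.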
bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

void AMDGPUPerfHint::visit(const Function &F) {
  auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
  if (!FIP.second)
    return;

  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      CallSite CS(const_cast<Instruction *>(&I));
      if (CS) {
        Function *Callee = CS.getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        visit(*Callee);
        auto Loc = FIM.find(Callee);

        assert(Loc != FIM.end() && "No func info");
        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }
}

void AMDGPUPerfHint::runOnFunction(Function &F) {
  if (FIM.find(&F) != FIM.end())
    return;

  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  visit(F);
  auto Loc = FIM.find(&F);

  assert(Loc != FIM.end() && "No func info");
  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount
                    << '\n'
                    << " IAMInst: " << Loc->second.IAMInstCount << '\n'
                    << " LSMInst: " << Loc->second.LSMInstCount << '\n'
                    << " TotalInst: " << Loc->second.InstCount << '\n');

  auto &FI = Loc->second;

  if (isMemBound(FI)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
  }
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}
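
// Worked example with the default weights: a kernel with InstCount = 1000,
// MemInstCount = 100, no indirect accesses and one large-stride access
// scores (100 + 0 * 1000 + 1 * 1000) * 100 / 1000 = 110 > 50, so
// needLimitWave returns true even though isMemBound above does not
// (100 * 100 / 1000 = 10).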
bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {
  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F);

  AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
  Analyzer.runOnFunction(F);
  return false;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}