//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is large stride.
  /// The purpose is to identify memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory access will be marked
  /// large stride memory access.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << " check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << " is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << " dropped\n");
  }

  LLVM_DEBUG(dbgs() << " is not IA\n");
  return false;
}

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
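        // For calls to functions defined in this module, reuse the counts
        // already computed for the callee instead of rescanning its body.
        // (Explanatory note: as a bottom-up CallGraphSCC pass, callees outside
        // the current SCC are visited first; callees not yet in FIM and
        // immediate recursion are simply skipped.)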
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}

AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
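  // (Rationale, not from the original comments: LDS is small on-chip memory,
  // so strided access there does not cause the cache thrashing this heuristic
  // targets.)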
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}
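
// Usage note (illustrative): the hidden cl::opt thresholds and weights defined
// above can be overridden from the command line when tuning these heuristics,
// assuming the pass runs as part of the normal AMDGPU codegen pipeline, e.g.
//
//   llc -mtriple=amdgcn -amdgpu-membound-threshold=30 \
//       -amdgpu-limit-wave-threshold=30 foo.ll
//
// which lowers both percentage thresholds so the "amdgpu-memory-bound" and
// "amdgpu-wave-limiter" attributes are applied more aggressively.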