//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting the number of waves to reduce cache thrashing.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-perf-hint"

static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64),
                      cl::Hidden,
                      cl::desc("Large stride memory access threshold"));

STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)

namespace {

struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  bool runOnFunction(Function &F);

private:
  struct MemAccessInfo {
    const Value *V;
    const Value *Base;
    int64_t Offset;
    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
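  /// Decide, from the counters gathered by visit(), whether a function looks
  /// memory bound (MemInstCount relative to MemBoundThresh) or whether a
  /// kernel should limit waves (indirect-access and large-stride counts
  /// weighted by IAWeight/LSWeight against LimitWaveThresh).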
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is a large-stride access.
  /// The purpose is to identify memory access patterns like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory accesses will be
  /// marked as large-stride accesses.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  bool isConstantAddr(const Value *V) const;
};

static const Value *getMemoryInstrPtr(const Instruction *Inst) {
  if (auto LI = dyn_cast<LoadInst>(Inst)) {
    return LI->getPointerOperand();
  }
  if (auto SI = dyn_cast<StoreInst>(Inst)) {
    return SI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
    return AI->getPointerOperand();
  }
  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
    return MI->getRawDest();
  }

  return nullptr;
}

bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
  SmallSet<const Value *, 32> WorkSet;
  SmallSet<const Value *, 32> Visited;
  if (const Value *MO = getMemoryInstrPtr(Inst)) {
    if (isGlobalAddr(MO))
      WorkSet.insert(MO);
  }

  while (!WorkSet.empty()) {
    const Value *V = *WorkSet.begin();
    WorkSet.erase(*WorkSet.begin());
    if (!Visited.insert(V).second)
      continue;
    LLVM_DEBUG(dbgs() << "  check: " << *V << '\n');

    if (auto LD = dyn_cast<LoadInst>(V)) {
      auto M = LD->getPointerOperand();
      if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) {
        LLVM_DEBUG(dbgs() << "    is IA\n");
        return true;
      }
      continue;
    }

    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
      auto P = GEP->getPointerOperand();
      WorkSet.insert(P);
      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
        WorkSet.insert(GEP->getOperand(I));
      continue;
    }

    if (auto U = dyn_cast<UnaryInstruction>(V)) {
      WorkSet.insert(U->getOperand(0));
      continue;
    }

    if (auto BO = dyn_cast<BinaryOperator>(V)) {
      WorkSet.insert(BO->getOperand(0));
      WorkSet.insert(BO->getOperand(1));
      continue;
    }

    if (auto S = dyn_cast<SelectInst>(V)) {
      WorkSet.insert(S->getFalseValue());
      WorkSet.insert(S->getTrueValue());
      continue;
    }

    if (auto E = dyn_cast<ExtractElementInst>(V)) {
      WorkSet.insert(E->getVectorOperand());
      continue;
    }

    LLVM_DEBUG(dbgs() << "    dropped\n");
  }

  LLVM_DEBUG(dbgs() << "  is not IA\n");
  return false;
}

AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    for (auto &I : B) {
      if (getMemoryInstrPtr(&I)) {
        if (isIndirectAccess(&I))
          ++FI.IAMInstCount;
        if (isLargeStride(&I))
          ++FI.LSMInstCount;
        ++FI.MemInstCount;
        ++FI.InstCount;
        continue;
      }
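      // Calls: fold the callee's already-computed counters into this
      // function. Indirect calls and bare declarations count as a single
      // instruction; immediate recursion and callees not yet recorded in
      // FIM are skipped.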
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCount;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCount += Loc->second.MemInstCount;
        FI.InstCount += Loc->second.InstCount;
        FI.IAMInstCount += Loc->second.IAMInstCount;
        FI.LSMInstCount += Loc->second.LSMInstCount;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCount;
      } else {
        ++FI.InstCount;
      }
    }
  }

  return &FI;
}

bool AMDGPUPerfHint::runOnFunction(Function &F) {
  const Module &M = *F.getParent();
  DL = &M.getDataLayout();

  if (F.hasFnAttribute("amdgpu-wave-limiter") &&
      F.hasFnAttribute("amdgpu-memory-bound"))
    return false;

  const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);

  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
                    << '\n'
                    << " IAMInst: " << Info->IAMInstCount << '\n'
                    << " LSMInst: " << Info->LSMInstCount << '\n'
                    << " TotalInst: " << Info->InstCount << '\n');

  if (isMemBound(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
    NumMemBound++;
    F.addFnAttr("amdgpu-memory-bound", "true");
  }

  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
    NumLimitWave++;
    F.addFnAttr("amdgpu-wave-limiter", "true");
  }

  return true;
}

bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
           FI.LSMInstCount * LSWeight) *
          100 / FI.InstCount) > LimitWaveThresh;
}

bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    // Flat likely points to global too.
    return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
  }
  return false;
}

bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType()))
    return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
  return false;
}

bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');

  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
  bool IsLargeStride = MAI.isLargeStride(LastAccess);
  if (MAI.Base)
    LastAccess = std::move(MAI);

  return IsLargeStride;
}
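// Build a MemAccessInfo (base pointer plus constant offset) for a memory
// instruction so that consecutive accesses in a block can be compared by
// MemAccessInfo::isLargeStride().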
AMDGPUPerfHint::MemAccessInfo
AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
  MemAccessInfo MAI;
  const Value *MO = getMemoryInstrPtr(Inst);

  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
  // Do not treat local-addr memory access as large stride.
  if (isLocalAddr(MO))
    return MAI;

  MAI.V = MO;
  MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
  return MAI;
}

bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
  if (auto PT = dyn_cast<PointerType>(V->getType())) {
    unsigned As = PT->getAddressSpace();
    return As == AMDGPUAS::CONSTANT_ADDRESS ||
           As == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
  return false;
}

bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
    MemAccessInfo &Reference) const {

  if (!Base || !Reference.Base || Base != Reference.Base)
    return false;

  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
                                            : Reference.Offset - Offset;
  bool Result = Diff > LargeStrideThresh;
  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
                    << print() << "<=>\n"
                    << Reference.print() << "Result:" << Result << '\n');
  return Result;
}
} // namespace

bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
    AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());

    if (Analyzer.runOnFunction(*F))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::isMemBound(FI->second);
}

bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
  auto FI = FIM.find(F);
  if (FI == FIM.end())
    return false;

  return AMDGPUPerfHint::needLimitWave(FI->second);
}