1*0b57cec5SDimitry Andric //===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===//
2*0b57cec5SDimitry Andric //
3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*0b57cec5SDimitry Andric //
7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
8*0b57cec5SDimitry Andric //
9*0b57cec5SDimitry Andric /// \file
/// \brief Analyzes if a function is potentially memory bound and if a kernel
/// may benefit from limiting number of waves to reduce cache thrashing.
12*0b57cec5SDimitry Andric ///
13*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
14*0b57cec5SDimitry Andric
15*0b57cec5SDimitry Andric #include "AMDGPU.h"
16*0b57cec5SDimitry Andric #include "AMDGPUPerfHintAnalysis.h"
17*0b57cec5SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
18*0b57cec5SDimitry Andric #include "llvm/ADT/SmallSet.h"
19*0b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h"
20*0b57cec5SDimitry Andric #include "llvm/Analysis/CallGraph.h"
21*0b57cec5SDimitry Andric #include "llvm/Analysis/ValueTracking.h"
22*0b57cec5SDimitry Andric #include "llvm/CodeGen/TargetLowering.h"
23*0b57cec5SDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
24*0b57cec5SDimitry Andric #include "llvm/CodeGen/TargetSubtargetInfo.h"
25*0b57cec5SDimitry Andric #include "llvm/IR/Instructions.h"
26*0b57cec5SDimitry Andric #include "llvm/IR/IntrinsicInst.h"
27*0b57cec5SDimitry Andric #include "llvm/Support/CommandLine.h"
28*0b57cec5SDimitry Andric #include "llvm/Target/TargetMachine.h"
29*0b57cec5SDimitry Andric
30*0b57cec5SDimitry Andric using namespace llvm;
31*0b57cec5SDimitry Andric
32*0b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-perf-hint"
33*0b57cec5SDimitry Andric
// Percentage of memory-instruction cost (relative to total instruction cost)
// above which a function is marked memory bound.
static cl::opt<unsigned>
    MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden,
                   cl::desc("Function mem bound threshold in %"));

// Percentage of weighted memory cost above which a kernel gets the
// wave-limiter attribute.
static cl::opt<unsigned>
    LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden,
                    cl::desc("Kernel limit wave threshold in %"));

// Extra weight applied to indirect-access memory instructions in the
// wave-limit decision.
static cl::opt<unsigned>
    IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden,
             cl::desc("Indirect access memory instruction weight"));

// Extra weight applied to large-stride memory instructions in the
// wave-limit decision.
static cl::opt<unsigned>
    LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden,
             cl::desc("Large stride memory access weight"));

// Offset distance between two accesses off the same base pointer beyond
// which the second access counts as large stride.
static cl::opt<unsigned>
    LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden,
                      cl::desc("Large stride memory access threshold"));
53*0b57cec5SDimitry Andric
STATISTIC(NumMemBound, "Number of functions marked as memory bound");
STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");

// Pass identification; the address of ID is the unique pass token.
char llvm::AMDGPUPerfHintAnalysis::ID = 0;
char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;

INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
                "Analysis if a function is memory bound", true, true)
62*0b57cec5SDimitry Andric
63*0b57cec5SDimitry Andric namespace {
64*0b57cec5SDimitry Andric
// Helper that performs the per-function analysis and records results in the
// FuncInfoMap owned by AMDGPUPerfHintAnalysis.
struct AMDGPUPerfHint {
  friend AMDGPUPerfHintAnalysis;

public:
  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_,
                 const TargetLowering *TLI_)
      : FIM(FIM_), DL(nullptr), TLI(TLI_) {}

  // Analyze F, cache its cost info in FIM, and add the "amdgpu-memory-bound" /
  // "amdgpu-wave-limiter" attributes when thresholds are exceeded. Returns
  // true if F was modified.
  bool runOnFunction(Function &F);

private:
  // A memory access described by its pointer operand plus the decomposition
  // of that pointer into a base value and a constant offset.
  struct MemAccessInfo {
    const Value *V = nullptr;    // Pointer operand of the access.
    const Value *Base = nullptr; // Underlying base pointer, if resolvable.
    int64_t Offset = 0;          // Constant offset from Base.
    MemAccessInfo() = default;
    // True if this access and Reference share a base pointer but their
    // offsets differ by more than LargeStrideThresh.
    bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    Printable print() const {
      return Printable([this](raw_ostream &OS) {
        OS << "Value: " << *V << '\n'
           << "Base: " << *Base << " Offset: " << Offset << '\n';
      });
    }
#endif
  };

  // Build a MemAccessInfo for a memory instruction (empty for local-addr
  // accesses).
  MemAccessInfo makeMemAccessInfo(Instruction *) const;

  MemAccessInfo LastAccess; // Last memory access info

  // Per-function cost summaries, shared with (and owned by) the pass.
  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;

  const DataLayout *DL;

  const TargetLowering *TLI;

  // Walk F once, accumulating instruction/memory costs into FIM[&F].
  AMDGPUPerfHintAnalysis::FuncInfo *visit(const Function &F);
  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);

  // True if Inst's address is computed from a value that is itself loaded
  // from global memory (pointer chasing).
  bool isIndirectAccess(const Instruction *Inst) const;

  /// Check if the instruction is large stride.
  /// The purpose is to identify memory access pattern like:
  /// x = a[i];
  /// y = a[i+1000];
  /// z = a[i+2000];
  /// In the above example, the second and third memory access will be marked
  /// large stride memory access.
  bool isLargeStride(const Instruction *Inst);

  bool isGlobalAddr(const Value *V) const;
  bool isLocalAddr(const Value *V) const;
  // True if I is a global load with at least one user in its own block.
  bool isGlobalLoadUsedInBB(const Instruction &) const;
};
121*0b57cec5SDimitry Andric
getMemoryInstrPtrAndType(const Instruction * Inst)122*0b57cec5SDimitry Andric static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
123*0b57cec5SDimitry Andric const Instruction *Inst) {
124*0b57cec5SDimitry Andric if (auto LI = dyn_cast<LoadInst>(Inst))
125*0b57cec5SDimitry Andric return {LI->getPointerOperand(), LI->getType()};
126*0b57cec5SDimitry Andric if (auto SI = dyn_cast<StoreInst>(Inst))
127*0b57cec5SDimitry Andric return {SI->getPointerOperand(), SI->getValueOperand()->getType()};
128*0b57cec5SDimitry Andric if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst))
129*0b57cec5SDimitry Andric return {AI->getPointerOperand(), AI->getCompareOperand()->getType()};
130*0b57cec5SDimitry Andric if (auto AI = dyn_cast<AtomicRMWInst>(Inst))
131*0b57cec5SDimitry Andric return {AI->getPointerOperand(), AI->getValOperand()->getType()};
132*0b57cec5SDimitry Andric if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst))
133*0b57cec5SDimitry Andric return {MI->getRawDest(), Type::getInt8Ty(MI->getContext())};
134*0b57cec5SDimitry Andric
135*0b57cec5SDimitry Andric return {nullptr, nullptr};
136*0b57cec5SDimitry Andric }
137*0b57cec5SDimitry Andric
isIndirectAccess(const Instruction * Inst) const138*0b57cec5SDimitry Andric bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
139*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
140*0b57cec5SDimitry Andric SmallSet<const Value *, 32> WorkSet;
141*0b57cec5SDimitry Andric SmallSet<const Value *, 32> Visited;
142*0b57cec5SDimitry Andric if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) {
143*0b57cec5SDimitry Andric if (isGlobalAddr(MO))
144*0b57cec5SDimitry Andric WorkSet.insert(MO);
145*0b57cec5SDimitry Andric }
146*0b57cec5SDimitry Andric
147*0b57cec5SDimitry Andric while (!WorkSet.empty()) {
148*0b57cec5SDimitry Andric const Value *V = *WorkSet.begin();
149*0b57cec5SDimitry Andric WorkSet.erase(*WorkSet.begin());
150*0b57cec5SDimitry Andric if (!Visited.insert(V).second)
151*0b57cec5SDimitry Andric continue;
152*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " check: " << *V << '\n');
153*0b57cec5SDimitry Andric
154*0b57cec5SDimitry Andric if (auto LD = dyn_cast<LoadInst>(V)) {
155*0b57cec5SDimitry Andric auto M = LD->getPointerOperand();
156*0b57cec5SDimitry Andric if (isGlobalAddr(M)) {
157*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " is IA\n");
158*0b57cec5SDimitry Andric return true;
159*0b57cec5SDimitry Andric }
160*0b57cec5SDimitry Andric continue;
161*0b57cec5SDimitry Andric }
162*0b57cec5SDimitry Andric
163*0b57cec5SDimitry Andric if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
164*0b57cec5SDimitry Andric auto P = GEP->getPointerOperand();
165*0b57cec5SDimitry Andric WorkSet.insert(P);
166*0b57cec5SDimitry Andric for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
167*0b57cec5SDimitry Andric WorkSet.insert(GEP->getOperand(I));
168*0b57cec5SDimitry Andric continue;
169*0b57cec5SDimitry Andric }
170*0b57cec5SDimitry Andric
171*0b57cec5SDimitry Andric if (auto U = dyn_cast<UnaryInstruction>(V)) {
172*0b57cec5SDimitry Andric WorkSet.insert(U->getOperand(0));
173*0b57cec5SDimitry Andric continue;
174*0b57cec5SDimitry Andric }
175*0b57cec5SDimitry Andric
176*0b57cec5SDimitry Andric if (auto BO = dyn_cast<BinaryOperator>(V)) {
177*0b57cec5SDimitry Andric WorkSet.insert(BO->getOperand(0));
178*0b57cec5SDimitry Andric WorkSet.insert(BO->getOperand(1));
179*0b57cec5SDimitry Andric continue;
180*0b57cec5SDimitry Andric }
181*0b57cec5SDimitry Andric
182*0b57cec5SDimitry Andric if (auto S = dyn_cast<SelectInst>(V)) {
183*0b57cec5SDimitry Andric WorkSet.insert(S->getFalseValue());
184*0b57cec5SDimitry Andric WorkSet.insert(S->getTrueValue());
185*0b57cec5SDimitry Andric continue;
186*0b57cec5SDimitry Andric }
187*0b57cec5SDimitry Andric
188*0b57cec5SDimitry Andric if (auto E = dyn_cast<ExtractElementInst>(V)) {
189*0b57cec5SDimitry Andric WorkSet.insert(E->getVectorOperand());
190*0b57cec5SDimitry Andric continue;
191*0b57cec5SDimitry Andric }
192*0b57cec5SDimitry Andric
193*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " dropped\n");
194*0b57cec5SDimitry Andric }
195*0b57cec5SDimitry Andric
196*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << " is not IA\n");
197*0b57cec5SDimitry Andric return false;
198*0b57cec5SDimitry Andric }
199*0b57cec5SDimitry Andric
200*0b57cec5SDimitry Andric // Returns true if the global load `I` is used in its own basic block.
isGlobalLoadUsedInBB(const Instruction & I) const201*0b57cec5SDimitry Andric bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
202*0b57cec5SDimitry Andric const auto *Ld = dyn_cast<LoadInst>(&I);
203*0b57cec5SDimitry Andric if (!Ld)
204*0b57cec5SDimitry Andric return false;
205*0b57cec5SDimitry Andric if (!isGlobalAddr(Ld->getPointerOperand()))
206*0b57cec5SDimitry Andric return false;
207*0b57cec5SDimitry Andric
208*0b57cec5SDimitry Andric for (const User *Usr : Ld->users()) {
209*0b57cec5SDimitry Andric if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
210*0b57cec5SDimitry Andric if (UsrInst->getParent() == I.getParent())
211*0b57cec5SDimitry Andric return true;
212*0b57cec5SDimitry Andric }
213*0b57cec5SDimitry Andric }
214*0b57cec5SDimitry Andric
215*0b57cec5SDimitry Andric return false;
216*0b57cec5SDimitry Andric }
217*0b57cec5SDimitry Andric
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];

  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');

  for (auto &B : F) {
    LastAccess = MemAccessInfo();
    // Dword-weighted count of global loads in B that have a user in B.
    unsigned UsedGlobalLoadsInBB = 0;
    for (auto &I : B) {
      if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
        // Cost of a memory instruction is its access size in dwords.
        unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
        // TODO: Check if the global load and its user are close to each other
        // instead (Or do this analysis in GCNSchedStrategy?).
        if (isGlobalLoadUsedInBB(I))
          UsedGlobalLoadsInBB += Size;
        if (isIndirectAccess(&I))
          FI.IAMInstCost += Size;
        if (isLargeStride(&I))
          FI.LSMInstCost += Size;
        FI.MemInstCost += Size;
        FI.InstCost += Size;
        continue;
      }
      if (auto *CB = dyn_cast<CallBase>(&I)) {
        Function *Callee = CB->getCalledFunction();
        // Indirect calls and declarations: nothing to inherit, charge one
        // unit of instruction cost.
        if (!Callee || Callee->isDeclaration()) {
          ++FI.InstCost;
          continue;
        }
        if (&F == Callee) // Handle immediate recursion
          continue;

        // Inherit the callee's accumulated costs if it has already been
        // analyzed; otherwise (e.g. mutual recursion) skip it.
        auto Loc = FIM.find(Callee);
        if (Loc == FIM.end())
          continue;

        FI.MemInstCost += Loc->second.MemInstCost;
        FI.InstCost += Loc->second.InstCost;
        FI.IAMInstCost += Loc->second.IAMInstCost;
        FI.LSMInstCost += Loc->second.LSMInstCost;
      } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
        // A GEP that forms a legal addressing mode is expected to fold into
        // its memory user and is therefore free.
        TargetLoweringBase::AddrMode AM;
        auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
        AM.BaseGV = dyn_cast_or_null<GlobalValue>(const_cast<Value *>(Ptr));
        AM.HasBaseReg = !AM.BaseGV;
        if (TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(),
                                       GEP->getPointerAddressSpace()))
          // Offset will likely be folded into load or store
          continue;
        ++FI.InstCost;
      } else {
        ++FI.InstCost;
      }
    }

    // Flag the function once any block is dominated (>50% by the weighted
    // count above) by locally-used global loads.
    if (!FI.HasDenseGlobalMemAcc) {
      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
      if (GlobalMemAccPercentage > 50) {
        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
                          << B.getName() << " has " << GlobalMemAccPercentage
                          << "% global memory access\n");
        FI.HasDenseGlobalMemAcc = true;
      }
    }
  }

  return &FI;
}
286*0b57cec5SDimitry Andric
runOnFunction(Function & F)287*0b57cec5SDimitry Andric bool AMDGPUPerfHint::runOnFunction(Function &F) {
288*0b57cec5SDimitry Andric const Module &M = *F.getParent();
289*0b57cec5SDimitry Andric DL = &M.getDataLayout();
290*0b57cec5SDimitry Andric
291*0b57cec5SDimitry Andric if (F.hasFnAttribute("amdgpu-wave-limiter") &&
292*0b57cec5SDimitry Andric F.hasFnAttribute("amdgpu-memory-bound"))
293*0b57cec5SDimitry Andric return false;
294*0b57cec5SDimitry Andric
295*0b57cec5SDimitry Andric const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
296*0b57cec5SDimitry Andric
297*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
298*0b57cec5SDimitry Andric << '\n'
299*0b57cec5SDimitry Andric << " IAMInst cost: " << Info->IAMInstCost << '\n'
300*0b57cec5SDimitry Andric << " LSMInst cost: " << Info->LSMInstCost << '\n'
301*0b57cec5SDimitry Andric << " TotalInst cost: " << Info->InstCost << '\n');
302*0b57cec5SDimitry Andric
303*0b57cec5SDimitry Andric bool Changed = false;
304*0b57cec5SDimitry Andric
305*0b57cec5SDimitry Andric if (isMemBound(*Info)) {
306*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
307*0b57cec5SDimitry Andric NumMemBound++;
308*0b57cec5SDimitry Andric F.addFnAttr("amdgpu-memory-bound", "true");
309*0b57cec5SDimitry Andric Changed = true;
310*0b57cec5SDimitry Andric }
311*0b57cec5SDimitry Andric
312*0b57cec5SDimitry Andric if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(*Info)) {
313*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
314*0b57cec5SDimitry Andric NumLimitWave++;
315*0b57cec5SDimitry Andric F.addFnAttr("amdgpu-wave-limiter", "true");
316*0b57cec5SDimitry Andric Changed = true;
317*0b57cec5SDimitry Andric }
318*0b57cec5SDimitry Andric
319*0b57cec5SDimitry Andric return Changed;
320*0b57cec5SDimitry Andric }
321*0b57cec5SDimitry Andric
isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo & FI)322*0b57cec5SDimitry Andric bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
323*0b57cec5SDimitry Andric // Reverting optimal scheduling in favour of occupancy with basic block(s)
324*0b57cec5SDimitry Andric // having dense global memory access can potentially hurt performance.
325*0b57cec5SDimitry Andric if (FI.HasDenseGlobalMemAcc)
326*0b57cec5SDimitry Andric return true;
327*0b57cec5SDimitry Andric
328*0b57cec5SDimitry Andric return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
329*0b57cec5SDimitry Andric }
330*0b57cec5SDimitry Andric
needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo & FI)331*0b57cec5SDimitry Andric bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
332*0b57cec5SDimitry Andric return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
333*0b57cec5SDimitry Andric FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
334*0b57cec5SDimitry Andric }
335*0b57cec5SDimitry Andric
isGlobalAddr(const Value * V) const336*0b57cec5SDimitry Andric bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
337*0b57cec5SDimitry Andric if (auto PT = dyn_cast<PointerType>(V->getType())) {
338*0b57cec5SDimitry Andric unsigned As = PT->getAddressSpace();
339*0b57cec5SDimitry Andric // Flat likely points to global too.
340*0b57cec5SDimitry Andric return As == AMDGPUAS::GLOBAL_ADDRESS || As == AMDGPUAS::FLAT_ADDRESS;
341*0b57cec5SDimitry Andric }
342*0b57cec5SDimitry Andric return false;
343*0b57cec5SDimitry Andric }
344*0b57cec5SDimitry Andric
isLocalAddr(const Value * V) const345*0b57cec5SDimitry Andric bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
346*0b57cec5SDimitry Andric if (auto PT = dyn_cast<PointerType>(V->getType()))
347*0b57cec5SDimitry Andric return PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
348*0b57cec5SDimitry Andric return false;
349*0b57cec5SDimitry Andric }
350*0b57cec5SDimitry Andric
isLargeStride(const Instruction * Inst)351*0b57cec5SDimitry Andric bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
352*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
353*0b57cec5SDimitry Andric
354*0b57cec5SDimitry Andric MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
355*0b57cec5SDimitry Andric bool IsLargeStride = MAI.isLargeStride(LastAccess);
356*0b57cec5SDimitry Andric if (MAI.Base)
357*0b57cec5SDimitry Andric LastAccess = std::move(MAI);
358*0b57cec5SDimitry Andric
359*0b57cec5SDimitry Andric return IsLargeStride;
360*0b57cec5SDimitry Andric }
361*0b57cec5SDimitry Andric
362*0b57cec5SDimitry Andric AMDGPUPerfHint::MemAccessInfo
makeMemAccessInfo(Instruction * Inst) const363*0b57cec5SDimitry Andric AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
364*0b57cec5SDimitry Andric MemAccessInfo MAI;
365*0b57cec5SDimitry Andric const Value *MO = getMemoryInstrPtrAndType(Inst).first;
366*0b57cec5SDimitry Andric
367*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
368*0b57cec5SDimitry Andric // Do not treat local-addr memory access as large stride.
369*0b57cec5SDimitry Andric if (isLocalAddr(MO))
370*0b57cec5SDimitry Andric return MAI;
371*0b57cec5SDimitry Andric
372*0b57cec5SDimitry Andric MAI.V = MO;
373*0b57cec5SDimitry Andric MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
374*0b57cec5SDimitry Andric return MAI;
375*0b57cec5SDimitry Andric }
376*0b57cec5SDimitry Andric
isLargeStride(MemAccessInfo & Reference) const377*0b57cec5SDimitry Andric bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
378*0b57cec5SDimitry Andric MemAccessInfo &Reference) const {
379*0b57cec5SDimitry Andric
380*0b57cec5SDimitry Andric if (!Base || !Reference.Base || Base != Reference.Base)
381*0b57cec5SDimitry Andric return false;
382*0b57cec5SDimitry Andric
383*0b57cec5SDimitry Andric uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
384*0b57cec5SDimitry Andric : Reference.Offset - Offset;
385*0b57cec5SDimitry Andric bool Result = Diff > LargeStrideThresh;
386*0b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
387*0b57cec5SDimitry Andric << print() << "<=>\n"
388*0b57cec5SDimitry Andric << Reference.print() << "Result:" << Result << '\n');
389*0b57cec5SDimitry Andric return Result;
390*0b57cec5SDimitry Andric }
391*0b57cec5SDimitry Andric } // namespace
392*0b57cec5SDimitry Andric
runOnSCC(CallGraphSCC & SCC)393*0b57cec5SDimitry Andric bool AMDGPUPerfHintAnalysis::runOnSCC(CallGraphSCC &SCC) {
394*0b57cec5SDimitry Andric auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
395*0b57cec5SDimitry Andric if (!TPC)
396*0b57cec5SDimitry Andric return false;
397*0b57cec5SDimitry Andric
398*0b57cec5SDimitry Andric const TargetMachine &TM = TPC->getTM<TargetMachine>();
399*0b57cec5SDimitry Andric
400*0b57cec5SDimitry Andric bool Changed = false;
401*0b57cec5SDimitry Andric for (CallGraphNode *I : SCC) {
402*0b57cec5SDimitry Andric Function *F = I->getFunction();
403*0b57cec5SDimitry Andric if (!F || F->isDeclaration())
404*0b57cec5SDimitry Andric continue;
405
406 const TargetSubtargetInfo *ST = TM.getSubtargetImpl(*F);
407 AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering());
408
409 if (Analyzer.runOnFunction(*F))
410 Changed = true;
411 }
412
413 return Changed;
414 }
415
isMemoryBound(const Function * F) const416 bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
417 auto FI = FIM.find(F);
418 if (FI == FIM.end())
419 return false;
420
421 return AMDGPUPerfHint::isMemBound(FI->second);
422 }
423
needsWaveLimiter(const Function * F) const424 bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
425 auto FI = FIM.find(F);
426 if (FI == FIM.end())
427 return false;
428
429 return AMDGPUPerfHint::needLimitWave(FI->second);
430 }
431