//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This is an AMDGPU-specific replacement for the standard inliner.
/// Its main purpose is to account for the fact that calls are not only
/// expensive on the AMDGPU, but much more expensive if a private memory
/// pointer is passed to a function as an argument. In that situation we
/// cannot eliminate private memory in the caller unless the callee is
/// inlined, and we end up with slow and expensive scratch accesses. Thus, we
/// boost the inline threshold for such functions here.
///
//===----------------------------------------------------------------------===//
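
// Illustrative sketch of the pattern this pass targets (the names below are
// hypothetical, not from this file): a caller passes a pointer into its own
// private (stack) array to a callee. Unless the callee is inlined, the
// pointer escapes, SROA cannot promote 'buf' to registers, and every access
// becomes scratch memory traffic:
//
//   void helper(float *p);   // receives a pointer into private memory
//   void caller() {
//     float buf[64];         // private alloca in the caller
//     helper(buf);           // escape blocks promotion unless inlined
//   }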

#include "AMDGPU.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Inliner.h"

using namespace llvm;

#define DEBUG_TYPE "inline"

static cl::opt<int>
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(2200),
              cl::desc("Cost of alloca argument"));

// If the amount of scratch memory to eliminate exceeds our ability to
// allocate it into registers, we gain nothing by aggressively inlining
// functions for that heuristic.
static cl::opt<unsigned>
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
                cl::desc("Maximum alloca size to use for inline cost"));

namespace {

class AMDGPUInliner : public LegacyInlinerBase {

public:
  AMDGPUInliner() : LegacyInlinerBase(ID) {
    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
    Params = getInlineParams();
  }

  static char ID; // Pass identification, replacement for typeid

  unsigned getInlineThreshold(CallSite CS) const;

  InlineCost getInlineCost(CallSite CS) override;

  bool runOnSCC(CallGraphSCC &SCC) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  TargetTransformInfoWrapperPass *TTIWP;

  InlineParams Params;
};

} // end anonymous namespace

char AMDGPUInliner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
                      "AMDGPU Function Integration/Inlining", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
                    "AMDGPU Function Integration/Inlining", false, false)

Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }

bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
  return LegacyInlinerBase::runOnSCC(SCC);
}

void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetTransformInfoWrapperPass>();
  LegacyInlinerBase::getAnalysisUsage(AU);
}
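
// A rough numeric sketch of the boost computed below, assuming the stock
// default threshold from getInlineParams() (225 at the time of writing) and
// no hint or size attributes: a call site passing a pointer into a 256-byte
// static alloca stays within ArgAllocaCutoff, so the threshold becomes
// 225 + 2200 = 2425 and inlining is strongly favored. A 1 KiB alloca exceeds
// the cutoff, AllocaSize is reset to 0, and the threshold stays at 225.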
unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
  int Thres = Params.DefaultThreshold;

  Function *Caller = CS.getCaller();
  // Listen to the inlinehint attribute when it would increase the threshold
  // and the caller does not need to minimize its size.
  Function *Callee = CS.getCalledFunction();
  bool InlineHint = Callee && !Callee->isDeclaration() &&
                    Callee->hasFnAttribute(Attribute::InlineHint);
  if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres &&
      !Caller->hasFnAttribute(Attribute::MinSize))
    Thres = Params.HintThreshold.getValue();

  const DataLayout &DL = Caller->getParent()->getDataLayout();
  if (!Callee)
    return (unsigned)Thres;

  // If a pointer to a private array is passed into a function, it will not
  // be optimized out, leaving scratch usage. Increase the inline threshold
  // to allow inlining in this case.
  uint64_t AllocaSize = 0;
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
  for (Value *PtrArg : CS.args()) {
    Type *Ty = PtrArg->getType();
    if (!Ty->isPointerTy() ||
        Ty->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
      continue;
    PtrArg = GetUnderlyingObject(PtrArg, DL);
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
        continue;
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
      // If the amount of stack memory is excessive we will not be able
      // to get rid of the scratch anyway, bail out.
      if (AllocaSize > ArgAllocaCutoff) {
        AllocaSize = 0;
        break;
      }
    }
  }
  if (AllocaSize)
    Thres += ArgAllocaCost;

  return (unsigned)Thres;
}

// Check if the call is just a wrapper around another call.
// In this case we only have call and ret instructions.
static bool isWrapperOnlyCall(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  if (!Callee || Callee->size() != 1)
    return false;
  const BasicBlock &BB = Callee->getEntryBlock();
  if (const Instruction *I = BB.getFirstNonPHI()) {
    if (!isa<CallInst>(I))
      return false;
    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
      LLVM_DEBUG(dbgs() << "    Wrapper only call detected: "
                        << Callee->getName() << '\n');
      return true;
    }
  }
  return false;
}

InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
  Function *Callee = CS.getCalledFunction();
  Function *Caller = CS.getCaller();

  // Check Callee before dereferencing it to fetch its TTI.
  if (!Callee || Callee->isDeclaration())
    return llvm::InlineCost::getNever("undefined callee");

  if (CS.isNoInline())
    return llvm::InlineCost::getNever("noinline");

  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
  if (!TTI.areInlineCompatible(Caller, Callee))
    return llvm::InlineCost::getNever("incompatible");

  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
    if (isInlineViable(*Callee))
      return llvm::InlineCost::getAlways("alwaysinline viable");
    return llvm::InlineCost::getNever("alwaysinline unviable");
  }

  if (isWrapperOnlyCall(CS))
    return llvm::InlineCost::getAlways("wrapper-only call");

  InlineParams LocalParams = Params;
  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
  bool RemarksEnabled = false;
  const auto &BBs = Caller->getBasicBlockList();
  if (!BBs.empty()) {
    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
    if (DI.isEnabled())
      RemarksEnabled = true;
  }

  OptimizationRemarkEmitter ORE(Caller);
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
      [this](Function &F) -> AssumptionCache & {
    return ACT->getAssumptionCache(F);
  };

  return llvm::getInlineCost(CS, Callee, LocalParams, TTI, GetAssumptionCache,
                             None, PSI, RemarksEnabled ? &ORE : nullptr);
}
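
// For reference, a sketch of how a backend would schedule this pass in place
// of the generic inliner via the legacy PassManagerBuilder extension hook
// (the exact call site in AMDGPUTargetMachine may differ):
//
//   Builder.Inliner = createAMDGPUFunctionInliningPass();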