//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file /// \brief This pass propagates attributes from kernels to the non-entry /// functions. Most of the library functions were not compiled for specific ABI, /// yet will be correctly compiled if proper attrbutes are propagated from the /// caller. /// /// The pass analyzes call graph and propagates ABI target features through the /// call graph. /// /// It can run in two modes: as a function or module pass. A function pass /// simply propagates attributes. A module pass clones functions if there are /// callers with different ABI. If a function is clonned all call sites will /// be updated to use a correct clone. /// /// A function pass is limited in functionality but can run early in the /// pipeline. A module pass is more powerful but has to run late, so misses /// library folding opportunities. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Analysis/CallGraph.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/InstrTypes.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" #include #define DEBUG_TYPE "amdgpu-propagate-attributes" using namespace llvm; namespace llvm { extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; } namespace { // Target features to propagate. static constexpr const FeatureBitset TargetFeatures = { AMDGPU::FeatureWavefrontSize16, AMDGPU::FeatureWavefrontSize32, AMDGPU::FeatureWavefrontSize64 }; // Attributes to propagate. // TODO: Support conservative min/max merging instead of cloning. static constexpr const char* AttributeNames[] = { "amdgpu-waves-per-eu", "amdgpu-flat-work-group-size" }; static constexpr unsigned NumAttr = sizeof(AttributeNames) / sizeof(AttributeNames[0]); class AMDGPUPropagateAttributes { class FnProperties { private: explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {} public: explicit FnProperties(const TargetMachine &TM, const Function &F) { Features = TM.getSubtargetImpl(F)->getFeatureBits(); for (unsigned I = 0; I < NumAttr; ++I) if (F.hasFnAttribute(AttributeNames[I])) Attributes[I] = F.getFnAttribute(AttributeNames[I]); } bool operator == (const FnProperties &Other) const { if ((Features & TargetFeatures) != (Other.Features & TargetFeatures)) return false; for (unsigned I = 0; I < NumAttr; ++I) if (Attributes[I] != Other.Attributes[I]) return false; return true; } FnProperties adjustToCaller(const FnProperties &CallerProps) const { FnProperties New((Features & ~TargetFeatures) | CallerProps.Features); for (unsigned I = 0; I < NumAttr; ++I) New.Attributes[I] = CallerProps.Attributes[I]; return New; } FeatureBitset Features; Optional Attributes[NumAttr]; }; class Clone { public: Clone(const FnProperties &Props, Function *OrigF, Function *NewF) : Properties(Props), OrigF(OrigF), NewF(NewF) {} FnProperties Properties; Function *OrigF; Function *NewF; }; const TargetMachine *TM; // Clone functions as needed or just set attributes. bool AllowClone; CallGraph *ModuleCG = nullptr; // Option propagation roots. SmallSet Roots; // Clones of functions with their attributes. SmallVector Clones; // To memoize address taken functions. SmallSet AddressTakenFunctions; // Find a clone with required features. Function *findFunction(const FnProperties &PropsNeeded, Function *OrigF); // Clone function \p F and set \p NewProps on the clone. // Cole takes the name of original function. Function *cloneWithProperties(Function &F, const FnProperties &NewProps); // Set new function's features in place. void setFeatures(Function &F, const FeatureBitset &NewFeatures); // Set new function's attributes in place. void setAttributes(Function &F, const ArrayRef> NewAttrs); std::string getFeatureString(const FeatureBitset &Features) const; // Propagate attributes from Roots. bool process(); public: AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) : TM(TM), AllowClone(AllowClone) {} // Use F as a root and propagate its attributes. bool process(Function &F); // Propagate attributes starting from kernel functions. bool process(Module &M, CallGraph *CG); // Remove attributes from F. // This is used in presence of address taken functions. bool removeAttributes(Function *F); // Handle call graph rooted at address taken functions. // This function will erase all attributes present // on all functions called from address taken functions transitively. bool handleAddressTakenFunctions(CallGraph *CG); }; // Allows to propagate attributes early, but no clonning is allowed as it must // be a function pass to run before any optimizations. // TODO: We shall only need a one instance of module pass, but that needs to be // in the linker pipeline which is currently not possible. class AMDGPUPropagateAttributesEarly : public FunctionPass { const TargetMachine *TM; public: static char ID; // Pass identification AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : FunctionPass(ID), TM(TM) { initializeAMDGPUPropagateAttributesEarlyPass( *PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; }; // Allows to propagate attributes with clonning but does that late in the // pipeline. class AMDGPUPropagateAttributesLate : public ModulePass { const TargetMachine *TM; public: static char ID; // Pass identification AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : ModulePass(ID), TM(TM) { initializeAMDGPUPropagateAttributesLatePass( *PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnModule(Module &M) override; }; } // end anonymous namespace. char AMDGPUPropagateAttributesEarly::ID = 0; char AMDGPUPropagateAttributesLate::ID = 0; INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, "amdgpu-propagate-attributes-early", "Early propagate attributes from kernels to functions", false, false) INITIALIZE_PASS(AMDGPUPropagateAttributesLate, "amdgpu-propagate-attributes-late", "Late propagate attributes from kernels to functions", false, false) bool AMDGPUPropagateAttributes::removeAttributes(Function *F) { bool Changed = false; if (!F) return Changed; LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n'); for (unsigned I = 0; I < NumAttr; ++I) { if (F->hasFnAttribute(AttributeNames[I])) { F->removeFnAttr(AttributeNames[I]); Changed = true; } } return Changed; } bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) { assert(ModuleCG && "Call graph not present"); bool Changed = false; SmallSet Visited; for (Function *F : AddressTakenFunctions) { CallGraphNode *CGN = (*CG)[F]; if (!Visited.count(CGN)) { Changed |= removeAttributes(F); Visited.insert(CGN); } std::queue SubGraph; SubGraph.push(CGN); while (!SubGraph.empty()) { CallGraphNode *CGN = SubGraph.front(); SubGraph.pop(); if (!Visited.count(CGN)) { Changed |= removeAttributes(CGN->getFunction()); Visited.insert(CGN); } for (auto N : *CGN) SubGraph.push(N.second); } } return Changed; } Function * AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, Function *OrigF) { // TODO: search for clone's clones. for (Clone &C : Clones) if (C.OrigF == OrigF && PropsNeeded == C.Properties) return C.NewF; return nullptr; } bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) { for (auto &F : M.functions()) if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) Roots.insert(&F); ModuleCG = CG; return process(); } bool AMDGPUPropagateAttributes::process(Function &F) { Roots.insert(&F); return process(); } bool AMDGPUPropagateAttributes::process() { bool Changed = false; SmallSet NewRoots; SmallSet Replaced; if (Roots.empty()) return false; Module &M = *(*Roots.begin())->getParent(); do { Roots.insert(NewRoots.begin(), NewRoots.end()); NewRoots.clear(); for (auto &F : M.functions()) { if (F.isDeclaration()) continue; if (F.hasAddressTaken(nullptr, true, true, true)) AddressTakenFunctions.insert(&F); const FnProperties CalleeProps(*TM, F); SmallVector, 32> ToReplace; SmallSet Visited; for (User *U : F.users()) { Instruction *I = dyn_cast(U); if (!I) continue; CallBase *CI = dyn_cast(I); // Only propagate attributes if F is the called function. Specifically, // do not propagate attributes if F is passed as an argument. // FIXME: handle bitcasted callee, e.g. // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)() if (!CI || CI->getCalledOperand() != &F) continue; Function *Caller = CI->getCaller(); if (!Caller || !Visited.insert(CI).second) continue; if (!Roots.count(Caller) && !NewRoots.count(Caller)) continue; const FnProperties CallerProps(*TM, *Caller); if (CalleeProps == CallerProps) { if (!Roots.count(&F)) NewRoots.insert(&F); continue; } Function *NewF = findFunction(CallerProps, &F); if (!NewF) { const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps); if (!AllowClone) { // This may set different features on different iteartions if // there is a contradiction in callers' attributes. In this case // we rely on a second pass running on Module, which is allowed // to clone. setFeatures(F, NewProps.Features); setAttributes(F, NewProps.Attributes); NewRoots.insert(&F); Changed = true; break; } NewF = cloneWithProperties(F, NewProps); Clones.push_back(Clone(CallerProps, &F, NewF)); NewRoots.insert(NewF); } ToReplace.push_back(std::make_pair(CI, NewF)); Replaced.insert(&F); Changed = true; } while (!ToReplace.empty()) { auto R = ToReplace.pop_back_val(); R.first->setCalledFunction(R.second); } } } while (!NewRoots.empty()); for (Function *F : Replaced) { if (F->use_empty()) F->eraseFromParent(); } Roots.clear(); Clones.clear(); // Keep the post processing related to indirect // calls separate to handle them gracefully. // The core traversal need not be affected by this. if (AllowClone) Changed |= handleAddressTakenFunctions(ModuleCG); return Changed; } Function * AMDGPUPropagateAttributes::cloneWithProperties(Function &F, const FnProperties &NewProps) { LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); ValueToValueMapTy dummy; Function *NewF = CloneFunction(&F, dummy); setFeatures(*NewF, NewProps.Features); setAttributes(*NewF, NewProps.Attributes); NewF->setVisibility(GlobalValue::DefaultVisibility); NewF->setLinkage(GlobalValue::InternalLinkage); // Swap names. If that is the only clone it will retain the name of now // dead value. Preserve original name for externally visible functions. if (F.hasName() && F.hasLocalLinkage()) { std::string NewName = std::string(NewF->getName()); NewF->takeName(&F); F.setName(NewName); } return NewF; } void AMDGPUPropagateAttributes::setFeatures(Function &F, const FeatureBitset &NewFeatures) { std::string NewFeatureStr = getFeatureString(NewFeatures); LLVM_DEBUG(dbgs() << "Set features " << getFeatureString(NewFeatures & TargetFeatures) << " on " << F.getName() << '\n'); F.removeFnAttr("target-features"); F.addFnAttr("target-features", NewFeatureStr); } void AMDGPUPropagateAttributes::setAttributes(Function &F, const ArrayRef> NewAttrs) { LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n"); for (unsigned I = 0; I < NumAttr; ++I) { F.removeFnAttr(AttributeNames[I]); if (NewAttrs[I]) { LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n'); F.addFnAttr(*NewAttrs[I]); } } } std::string AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const { std::string Ret; for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { if (Features[KV.Value]) Ret += (StringRef("+") + KV.Key + ",").str(); else if (TargetFeatures[KV.Value]) Ret += (StringRef("-") + KV.Key + ",").str(); } Ret.pop_back(); // Remove last comma. return Ret; } bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { if (!TM) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; TM = &TPC->getTM(); } if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) return false; return AMDGPUPropagateAttributes(TM, false).process(F); } void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); } bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { if (!TM) { auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; TM = &TPC->getTM(); } CallGraph &CG = getAnalysis().getCallGraph(); return AMDGPUPropagateAttributes(TM, true).process(M, &CG); } FunctionPass *llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { return new AMDGPUPropagateAttributesEarly(TM); } ModulePass *llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { return new AMDGPUPropagateAttributesLate(TM); } PreservedAnalyses AMDGPUPropagateAttributesEarlyPass::run(Function &F, FunctionAnalysisManager &AM) { if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) return PreservedAnalyses::all(); return AMDGPUPropagateAttributes(&TM, false).process(F) ? PreservedAnalyses::none() : PreservedAnalyses::all(); } PreservedAnalyses AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) { AMDGPUPropagateAttributes APA(&TM, true); CallGraph &CG = MAM.getResult(M); const bool Changed = APA.process(M, &CG); return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); }