1 //===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// \brief This pass propagates attributes from kernels to the non-entry 11 /// functions. Most of the library functions were not compiled for specific ABI, 12 /// yet will be correctly compiled if proper attrbutes are propagated from the 13 /// caller. 14 /// 15 /// The pass analyzes call graph and propagates ABI target features through the 16 /// call graph. 17 /// 18 /// It can run in two modes: as a function or module pass. A function pass 19 /// simply propagates attributes. A module pass clones functions if there are 20 /// callers with different ABI. If a function is clonned all call sites will 21 /// be updated to use a correct clone. 22 /// 23 /// A function pass is limited in functionality but can run early in the 24 /// pipeline. A module pass is more powerful but has to run late, so misses 25 /// library folding opportunities. 26 // 27 //===----------------------------------------------------------------------===// 28 29 #include "AMDGPU.h" 30 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 31 #include "Utils/AMDGPUBaseInfo.h" 32 #include "llvm/ADT/SmallSet.h" 33 #include "llvm/Analysis/CallGraph.h" 34 #include "llvm/CodeGen/TargetPassConfig.h" 35 #include "llvm/CodeGen/TargetSubtargetInfo.h" 36 #include "llvm/IR/InstrTypes.h" 37 #include "llvm/Target/TargetMachine.h" 38 #include "llvm/Transforms/Utils/Cloning.h" 39 #include <queue> 40 41 #define DEBUG_TYPE "amdgpu-propagate-attributes" 42 43 using namespace llvm; 44 45 namespace llvm { 46 extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1]; 47 } 48 49 namespace { 50 51 // Target features to propagate. 52 static constexpr const FeatureBitset TargetFeatures = { 53 AMDGPU::FeatureWavefrontSize16, 54 AMDGPU::FeatureWavefrontSize32, 55 AMDGPU::FeatureWavefrontSize64 56 }; 57 58 // Attributes to propagate. 59 // TODO: Support conservative min/max merging instead of cloning. 60 static constexpr const char* AttributeNames[] = { 61 "amdgpu-waves-per-eu", 62 "amdgpu-flat-work-group-size" 63 }; 64 65 static constexpr unsigned NumAttr = 66 sizeof(AttributeNames) / sizeof(AttributeNames[0]); 67 68 class AMDGPUPropagateAttributes { 69 70 class FnProperties { 71 private: 72 explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {} 73 74 public: 75 explicit FnProperties(const TargetMachine &TM, const Function &F) { 76 Features = TM.getSubtargetImpl(F)->getFeatureBits(); 77 78 for (unsigned I = 0; I < NumAttr; ++I) 79 if (F.hasFnAttribute(AttributeNames[I])) 80 Attributes[I] = F.getFnAttribute(AttributeNames[I]); 81 } 82 83 bool operator == (const FnProperties &Other) const { 84 if ((Features & TargetFeatures) != (Other.Features & TargetFeatures)) 85 return false; 86 for (unsigned I = 0; I < NumAttr; ++I) 87 if (Attributes[I] != Other.Attributes[I]) 88 return false; 89 return true; 90 } 91 92 FnProperties adjustToCaller(const FnProperties &CallerProps) const { 93 FnProperties New((Features & ~TargetFeatures) | CallerProps.Features); 94 for (unsigned I = 0; I < NumAttr; ++I) 95 New.Attributes[I] = CallerProps.Attributes[I]; 96 return New; 97 } 98 99 FeatureBitset Features; 100 Optional<Attribute> Attributes[NumAttr]; 101 }; 102 103 class Clone { 104 public: 105 Clone(const FnProperties &Props, Function *OrigF, Function *NewF) : 106 Properties(Props), OrigF(OrigF), NewF(NewF) {} 107 108 FnProperties Properties; 109 Function *OrigF; 110 Function *NewF; 111 }; 112 113 const TargetMachine *TM; 114 115 // Clone functions as needed or just set attributes. 116 bool AllowClone; 117 118 CallGraph *ModuleCG = nullptr; 119 120 // Option propagation roots. 121 SmallSet<Function *, 32> Roots; 122 123 // Clones of functions with their attributes. 124 SmallVector<Clone, 32> Clones; 125 126 // To memoize address taken functions. 127 SmallSet<Function *, 32> AddressTakenFunctions; 128 129 // Find a clone with required features. 130 Function *findFunction(const FnProperties &PropsNeeded, 131 Function *OrigF); 132 133 // Clone function \p F and set \p NewProps on the clone. 134 // Cole takes the name of original function. 135 Function *cloneWithProperties(Function &F, const FnProperties &NewProps); 136 137 // Set new function's features in place. 138 void setFeatures(Function &F, const FeatureBitset &NewFeatures); 139 140 // Set new function's attributes in place. 141 void setAttributes(Function &F, const ArrayRef<Optional<Attribute>> NewAttrs); 142 143 std::string getFeatureString(const FeatureBitset &Features) const; 144 145 // Propagate attributes from Roots. 146 bool process(); 147 148 public: 149 AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) 150 : TM(TM), AllowClone(AllowClone) {} 151 152 // Use F as a root and propagate its attributes. 153 bool process(Function &F); 154 155 // Propagate attributes starting from kernel functions. 156 bool process(Module &M, CallGraph *CG); 157 158 // Remove attributes from F. 159 // This is used in presence of address taken functions. 160 bool removeAttributes(Function *F); 161 162 // Handle call graph rooted at address taken functions. 163 // This function will erase all attributes present 164 // on all functions called from address taken functions transitively. 165 bool handleAddressTakenFunctions(CallGraph *CG); 166 }; 167 168 // Allows to propagate attributes early, but no clonning is allowed as it must 169 // be a function pass to run before any optimizations. 170 // TODO: We shall only need a one instance of module pass, but that needs to be 171 // in the linker pipeline which is currently not possible. 172 class AMDGPUPropagateAttributesEarly : public FunctionPass { 173 const TargetMachine *TM; 174 175 public: 176 static char ID; // Pass identification 177 178 AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) : 179 FunctionPass(ID), TM(TM) { 180 initializeAMDGPUPropagateAttributesEarlyPass( 181 *PassRegistry::getPassRegistry()); 182 } 183 184 bool runOnFunction(Function &F) override; 185 }; 186 187 // Allows to propagate attributes with clonning but does that late in the 188 // pipeline. 189 class AMDGPUPropagateAttributesLate : public ModulePass { 190 const TargetMachine *TM; 191 192 public: 193 static char ID; // Pass identification 194 195 AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) : 196 ModulePass(ID), TM(TM) { 197 initializeAMDGPUPropagateAttributesLatePass( 198 *PassRegistry::getPassRegistry()); 199 } 200 201 void getAnalysisUsage(AnalysisUsage &AU) const override; 202 bool runOnModule(Module &M) override; 203 }; 204 205 } // end anonymous namespace. 206 207 char AMDGPUPropagateAttributesEarly::ID = 0; 208 char AMDGPUPropagateAttributesLate::ID = 0; 209 210 INITIALIZE_PASS(AMDGPUPropagateAttributesEarly, 211 "amdgpu-propagate-attributes-early", 212 "Early propagate attributes from kernels to functions", 213 false, false) 214 INITIALIZE_PASS(AMDGPUPropagateAttributesLate, 215 "amdgpu-propagate-attributes-late", 216 "Late propagate attributes from kernels to functions", 217 false, false) 218 219 bool AMDGPUPropagateAttributes::removeAttributes(Function *F) { 220 bool Changed = false; 221 if (!F) 222 return Changed; 223 LLVM_DEBUG(dbgs() << "Removing attributes from " << F->getName() << '\n'); 224 for (unsigned I = 0; I < NumAttr; ++I) { 225 if (F->hasFnAttribute(AttributeNames[I])) { 226 F->removeFnAttr(AttributeNames[I]); 227 Changed = true; 228 } 229 } 230 return Changed; 231 } 232 233 bool AMDGPUPropagateAttributes::handleAddressTakenFunctions(CallGraph *CG) { 234 assert(ModuleCG && "Call graph not present"); 235 236 bool Changed = false; 237 SmallSet<CallGraphNode *, 32> Visited; 238 239 for (Function *F : AddressTakenFunctions) { 240 CallGraphNode *CGN = (*CG)[F]; 241 if (!Visited.count(CGN)) { 242 Changed |= removeAttributes(F); 243 Visited.insert(CGN); 244 } 245 246 std::queue<CallGraphNode *> SubGraph; 247 SubGraph.push(CGN); 248 while (!SubGraph.empty()) { 249 CallGraphNode *CGN = SubGraph.front(); 250 SubGraph.pop(); 251 if (!Visited.count(CGN)) { 252 Changed |= removeAttributes(CGN->getFunction()); 253 Visited.insert(CGN); 254 } 255 for (auto N : *CGN) 256 SubGraph.push(N.second); 257 } 258 } 259 return Changed; 260 } 261 262 Function * 263 AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded, 264 Function *OrigF) { 265 // TODO: search for clone's clones. 266 for (Clone &C : Clones) 267 if (C.OrigF == OrigF && PropsNeeded == C.Properties) 268 return C.NewF; 269 270 return nullptr; 271 } 272 273 bool AMDGPUPropagateAttributes::process(Module &M, CallGraph *CG) { 274 for (auto &F : M.functions()) 275 if (AMDGPU::isEntryFunctionCC(F.getCallingConv())) 276 Roots.insert(&F); 277 278 ModuleCG = CG; 279 return process(); 280 } 281 282 bool AMDGPUPropagateAttributes::process(Function &F) { 283 Roots.insert(&F); 284 return process(); 285 } 286 287 bool AMDGPUPropagateAttributes::process() { 288 bool Changed = false; 289 SmallSet<Function *, 32> NewRoots; 290 SmallSet<Function *, 32> Replaced; 291 292 if (Roots.empty()) 293 return false; 294 Module &M = *(*Roots.begin())->getParent(); 295 296 do { 297 Roots.insert(NewRoots.begin(), NewRoots.end()); 298 NewRoots.clear(); 299 300 for (auto &F : M.functions()) { 301 if (F.isDeclaration()) 302 continue; 303 304 if (F.hasAddressTaken(nullptr, true, true, true)) 305 AddressTakenFunctions.insert(&F); 306 307 const FnProperties CalleeProps(*TM, F); 308 SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace; 309 SmallSet<CallBase *, 32> Visited; 310 311 for (User *U : F.users()) { 312 Instruction *I = dyn_cast<Instruction>(U); 313 if (!I) 314 continue; 315 CallBase *CI = dyn_cast<CallBase>(I); 316 // Only propagate attributes if F is the called function. Specifically, 317 // do not propagate attributes if F is passed as an argument. 318 // FIXME: handle bitcasted callee, e.g. 319 // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)() 320 if (!CI || CI->getCalledOperand() != &F) 321 continue; 322 Function *Caller = CI->getCaller(); 323 if (!Caller || !Visited.insert(CI).second) 324 continue; 325 if (!Roots.count(Caller) && !NewRoots.count(Caller)) 326 continue; 327 328 const FnProperties CallerProps(*TM, *Caller); 329 330 if (CalleeProps == CallerProps) { 331 if (!Roots.count(&F)) 332 NewRoots.insert(&F); 333 continue; 334 } 335 336 Function *NewF = findFunction(CallerProps, &F); 337 if (!NewF) { 338 const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps); 339 if (!AllowClone) { 340 // This may set different features on different iteartions if 341 // there is a contradiction in callers' attributes. In this case 342 // we rely on a second pass running on Module, which is allowed 343 // to clone. 344 setFeatures(F, NewProps.Features); 345 setAttributes(F, NewProps.Attributes); 346 NewRoots.insert(&F); 347 Changed = true; 348 break; 349 } 350 351 NewF = cloneWithProperties(F, NewProps); 352 Clones.push_back(Clone(CallerProps, &F, NewF)); 353 NewRoots.insert(NewF); 354 } 355 356 ToReplace.push_back(std::make_pair(CI, NewF)); 357 Replaced.insert(&F); 358 359 Changed = true; 360 } 361 362 while (!ToReplace.empty()) { 363 auto R = ToReplace.pop_back_val(); 364 R.first->setCalledFunction(R.second); 365 } 366 } 367 } while (!NewRoots.empty()); 368 369 for (Function *F : Replaced) { 370 if (F->use_empty()) 371 F->eraseFromParent(); 372 } 373 374 Roots.clear(); 375 Clones.clear(); 376 377 // Keep the post processing related to indirect 378 // calls separate to handle them gracefully. 379 // The core traversal need not be affected by this. 380 if (AllowClone) 381 Changed |= handleAddressTakenFunctions(ModuleCG); 382 383 return Changed; 384 } 385 386 Function * 387 AMDGPUPropagateAttributes::cloneWithProperties(Function &F, 388 const FnProperties &NewProps) { 389 LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n'); 390 391 ValueToValueMapTy dummy; 392 Function *NewF = CloneFunction(&F, dummy); 393 setFeatures(*NewF, NewProps.Features); 394 setAttributes(*NewF, NewProps.Attributes); 395 NewF->setVisibility(GlobalValue::DefaultVisibility); 396 NewF->setLinkage(GlobalValue::InternalLinkage); 397 398 // Swap names. If that is the only clone it will retain the name of now 399 // dead value. Preserve original name for externally visible functions. 400 if (F.hasName() && F.hasLocalLinkage()) { 401 std::string NewName = std::string(NewF->getName()); 402 NewF->takeName(&F); 403 F.setName(NewName); 404 } 405 406 return NewF; 407 } 408 409 void AMDGPUPropagateAttributes::setFeatures(Function &F, 410 const FeatureBitset &NewFeatures) { 411 std::string NewFeatureStr = getFeatureString(NewFeatures); 412 413 LLVM_DEBUG(dbgs() << "Set features " 414 << getFeatureString(NewFeatures & TargetFeatures) 415 << " on " << F.getName() << '\n'); 416 417 F.removeFnAttr("target-features"); 418 F.addFnAttr("target-features", NewFeatureStr); 419 } 420 421 void AMDGPUPropagateAttributes::setAttributes(Function &F, 422 const ArrayRef<Optional<Attribute>> NewAttrs) { 423 LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n"); 424 for (unsigned I = 0; I < NumAttr; ++I) { 425 F.removeFnAttr(AttributeNames[I]); 426 if (NewAttrs[I]) { 427 LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n'); 428 F.addFnAttr(*NewAttrs[I]); 429 } 430 } 431 } 432 433 std::string 434 AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const 435 { 436 std::string Ret; 437 for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) { 438 if (Features[KV.Value]) 439 Ret += (StringRef("+") + KV.Key + ",").str(); 440 else if (TargetFeatures[KV.Value]) 441 Ret += (StringRef("-") + KV.Key + ",").str(); 442 } 443 Ret.pop_back(); // Remove last comma. 444 return Ret; 445 } 446 447 bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) { 448 if (!TM) { 449 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 450 if (!TPC) 451 return false; 452 453 TM = &TPC->getTM<TargetMachine>(); 454 } 455 456 if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) 457 return false; 458 459 return AMDGPUPropagateAttributes(TM, false).process(F); 460 } 461 462 void AMDGPUPropagateAttributesLate::getAnalysisUsage(AnalysisUsage &AU) const { 463 AU.addRequired<CallGraphWrapperPass>(); 464 } 465 466 bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) { 467 if (!TM) { 468 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); 469 if (!TPC) 470 return false; 471 472 TM = &TPC->getTM<TargetMachine>(); 473 } 474 CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph(); 475 return AMDGPUPropagateAttributes(TM, true).process(M, &CG); 476 } 477 478 FunctionPass 479 *llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) { 480 return new AMDGPUPropagateAttributesEarly(TM); 481 } 482 483 ModulePass 484 *llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) { 485 return new AMDGPUPropagateAttributesLate(TM); 486 } 487 488 PreservedAnalyses 489 AMDGPUPropagateAttributesEarlyPass::run(Function &F, 490 FunctionAnalysisManager &AM) { 491 if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) 492 return PreservedAnalyses::all(); 493 494 return AMDGPUPropagateAttributes(&TM, false).process(F) 495 ? PreservedAnalyses::none() 496 : PreservedAnalyses::all(); 497 } 498 499 PreservedAnalyses 500 AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &MAM) { 501 AMDGPUPropagateAttributes APA(&TM, true); 502 CallGraph &CG = MAM.getResult<CallGraphAnalysis>(M); 503 const bool Changed = APA.process(M, &CG); 504 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); 505 } 506