//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
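// Casts from the local or private address spaces need the aperture base
// addresses, which, absent hardware aperture registers, are read through the
// queue pointer.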
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
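  // These attributes are copied from the callee to the caller so that the
  // caller (and ultimately the kernel) keeps the corresponding implicit
  // inputs enabled during argument lowering.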
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
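        // For now an indirect call (other than inline asm) only records that
        // a call exists; no callee attributes can be propagated for it.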
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from most number of uses to least
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}
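
// This factory is wired into the AMDGPU codegen pipeline from
// AMDGPUTargetMachine.cpp; the attributes added above are consumed later
// during calling-convention and argument lowering to decide which implicit
// kernel inputs (queue ptr, dispatch ptr, workitem/workgroup IDs, etc.)
// must be preserved.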