//===- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;

  bool addFeatureAttributes(Function &F);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  Module &M = SCC.getCallGraph().getModule();
  Triple TT(M.getTargetTriple());

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}