//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;

  bool addFeatureAttributes(Function &F);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
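// Casts from the private or local address spaces need the aperture base
// addresses to form a flat pointer; unless the subtarget provides aperture
// registers (see the HasApertureRegs check in addFeatureAttributes), those
// bases are read via the queue pointer, so the kernel must preserve it.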
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
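  // Attributes are propagated bottom-up: the SCC pass visits callees before
  // callers, so if a callee is already marked as needing one of these inputs,
  // the caller inherits the same attribute and the corresponding register
  // inputs survive calling convention lowering.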
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  Module &M = SCC.getCallGraph().getModule();
  Triple TT(M.getTargetTriple());

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}