//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  AMDGPUAS AS;

  bool addFeatureAttributes(Function &F);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    AMDGPUAS AS);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
  return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
                                 const AMDGPUAS &AS) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
                                                     AMDGPUAS AS) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS, AS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  AMDGPUAS AS) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE, AS))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC, AS)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  Module &M = SCC.getCallGraph().getModule();
  Triple TT(M.getTargetTriple());

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  AS = AMDGPU::getAMDGPUAS(CG.getModule());
  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}