//===-- AMDGPUAnnotateKernelFeaturesPass.cpp ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  AMDGPUAS AS;

  bool addFeatureAttributes(Function &F);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    AMDGPUAS AS);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
  return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
                                 const AMDGPUAS &AS) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
                                                     AMDGPUAS AS) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS, AS);
  }

  return false;
}

// Walk a constant and its operands looking for addrspacecast expressions that
// would require the queue pointer.
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  AMDGPUAS AS) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE, AS))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id, because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

// Copy the named attribute from the callee to the caller if the callee has it.
// Returns true if the callee had the attribute.
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }

  return false;
}

// Propagate the ABI input attributes required by a callee up to its caller.
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // Note: the x ids are unnecessarily propagated to kernels, which always have
  // them enabled.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

// Add the attributes for every ABI input F requires, either directly through
// intrinsic uses and address space casts, or indirectly through its callees.
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC, AS)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  Module &M = SCC.getCallGraph().getModule();
  Triple TT(M.getTargetTriple());

  bool Changed = false;
  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  AS = AMDGPU::getAMDGPUAS(CG.getModule());
  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}
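
// Usage note (an illustrative sketch, not part of the pass): the pass is
// constructed via createAMDGPUAnnotateKernelFeaturesPass() above and is meant
// to be scheduled by the AMDGPU target machine early in the codegen IR
// pipeline, before calling convention/argument lowering, roughly:
//
//   // Inside the target's TargetPassConfig setup (placement is illustrative):
//   addPass(createAMDGPUAnnotateKernelFeaturesPass());
//
// Running it before argument lowering ensures attributes such as
// "amdgpu-queue-ptr" or "amdgpu-work-item-id-y" are visible when deciding
// which implicit kernel/function arguments to materialize.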