//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)
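
// Illustrative sketch (not part of the upstream sources): given a function
// that calls llvm.amdgcn.workitem.id.y, this pass annotates it roughly as
// shown below, so calling convention lowering knows the Y workitem id input
// must be preserved. The attribute string matches intrinsicToAttrName()
// below; the function @foo and the surrounding IR are hypothetical, for
// illustration only.
//
//   define void @foo() #0 {
//     %id = call i32 @llvm.amdgcn.workitem.id.y()
//     ...
//   }
//   attributes #0 = { "amdgpu-work-item-id-y" }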

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else
      processUniformWorkGroupAttribute();

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}
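
// Usage note (a sketch, not part of this file's upstream contents): the pass
// is normally scheduled by the AMDGPU target's pass configuration ahead of
// instruction selection, roughly as below; the exact hook may differ.
//
//   // In the target's TargetPassConfig setup:
//   addPass(createAMDGPUAnnotateKernelFeaturesPass());
//
// doInitialization() above then retrieves the TargetMachine from the
// enclosing TargetPassConfig, which is why the pass reports a fatal error
// when run without a configured target machine.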