//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)
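
// For illustration, a hypothetical kernel (not taken from a real test) such as
//
//   define amdgpu_kernel void @kern() {
//     %id = call i32 @llvm.amdgcn.workitem.id.y()
//     ret void
//   }
//
// is annotated by this pass with the "amdgpu-work-item-id-y" attribute, which
// calling convention lowering later uses to decide which implicit inputs the
// function needs.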

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}
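
// For illustration, a hypothetical cast from the private (scratch) address
// space to the flat address space such as
//
//   %flat = addrspacecast i8 addrspace(5)* %priv to i8*
//
// needs the source aperture, so on subtargets without aperture registers the
// enclosing function is marked "amdgpu-queue-ptr" (see addFeatureAttributes
// below).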

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}
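
// For illustration, if a hypothetical callee @helper is already annotated
// with "amdgpu-dispatch-ptr", then copyFeaturesToFunction() below copies that
// attribute to each of its callers, so the dispatch pointer is made available
// to (and passed through) the calling function as well.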

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // The x ids are unnecessarily propagated to kernels; they are always
  // initialized there.
  static constexpr StringLiteral AttrNames[] = {
    "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
    "amdgpu-implicitarg-ptr"};

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}
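
// For illustration, given a hypothetical kernel with
// "uniform-work-group-size"="true" that calls a function defined in the same
// module: propagateUniformWorkGroupAttribute() above marks the callee "true"
// as well. A callee without an exact definition is instead forced to "false",
// as is any caller that lacks the attribute entirely.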

bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
            F.addFnAttr("amdgpu-kernarg-segment-ptr");
          } else {
            StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                     NeedQueuePtr);
            if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
              F.addFnAttr(AttrName);
              Changed = true;
            }
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most references to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}