//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// which will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
      const Constant *EntryC,
      SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
      bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
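// Casting a local or private pointer to flat needs the aperture base of the
// source segment; on subtargets without hardware aperture registers that base
// is read via the queue pointer, which is why such casts imply
// "amdgpu-queue-ptr" below.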
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
    bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
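//
// Maps an intrinsic ID to the attribute naming the implicit argument it
// requires, or an empty string if none is needed. NonKernelOnly is set for
// attributes that only need to be added on non-kernel functions; IsQueuePtr
// is set when the intrinsic requires the queue pointer.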
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : ImplicitAttrNames)
    handleAttr(Parent, Callee, AttrName);
}

bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed = propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
    Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
            .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

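// Scan the body of F for intrinsic calls, call sites, address space casts and
// stack objects, and add the corresponding implicit-argument and feature
// attributes to F. Attributes are also copied from directly called functions.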
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function's address is taken (hasAddressTaken() is true),
  // add all attributes corresponding to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of indirect call.
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                   NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers
  // if there is an indirect call; in such cases, hasAddressTaken()
  // would be false for kernels and functions making an indirect call
  // (if they are themselves not indirectly called).
  // We must tag all such kernels/functions with all implicit attributes
  // for correctness.
  // e.g.
  // 1. Kernel K1 makes an indirect call to function F1.
  //    Without detecting an indirect call in K1, this pass will not
  //    add all implicit args to K1 (which is incorrect).
  // 2. Kernel K1 makes direct call to F1 which makes indirect call to function
  //    F2.
  //    Without detecting an indirect call in F1 (whose hasAddressTaken() is
  //    false), the pass will not add all implicit args to F1 (which is
  //    essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
      continue;
    // Add feature attributes
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}