//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions that use intrinsics
/// that affect calling convention lowering.
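///
/// For example, a function that calls llvm.amdgcn.workitem.id.y is given the
/// "amdgpu-work-item-id-y" attribute, and that attribute is copied to its
/// callers so argument lowering knows to set up the corresponding input.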
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {
static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
    bool HasApertureRegs);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)


// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

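// Returns true if \p C refers to a global in the local (LDS) or region (GDS)
// address space.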
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

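// Returns true if \p CE is an addrspacecast whose source address space
// requires the queue pointer, i.e. a cast from local or private to flat.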
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

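// Walk the operands of \p EntryC (depth-first, deduplicated through
// \p ConstantExprVisited) and return true if any constant requires the queue
// pointer: a DS global referenced from a non-entry function, or an
// addrspacecast from local/private to flat when aperture registers are not
// available.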
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
  bool IsFunc, bool HasApertureRegs) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // We need to trap on DS globals in non-entry functions.
    if (IsFunc && isDSAddress(C))
      return true;

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (!HasApertureRegs && visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are
// always initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

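// If \p Callee carries the attribute \p Name, add it to \p Parent as well.
// Returns true if the attribute was copied.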
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

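// Copy every implicit-argument attribute that \p Callee carries to \p Parent,
// and record whether the queue pointer becomes required.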
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : ImplicitAttrNames)
    handleAttr(Parent, Callee, AttrName);
}

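// Propagate the "uniform-work-group-size" attribute along each caller->callee
// edge of the call graph nodes collected in NodeList.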
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

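// Propagate "uniform-work-group-size" from \p Caller to \p Callee: an
// externally defined callee is conservatively marked "false"; a caller marked
// "true" passes "true" on to callees that do not yet have the attribute;
// otherwise both ends of the edge are marked "false". Returns true if any
// attribute was added.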
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

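// Scan \p F for intrinsic calls, direct and indirect calls, stack objects,
// addrspacecasts and constant expressions, and add the implicit-argument,
// "amdgpu-calls" and "amdgpu-stack-objects" attributes they imply. Returns
// true if any attribute was added.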
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool HaveStackObjects = false;
  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool HasIndirectCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
  CallingConv::ID CC = F.getCallingConv();
  bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

  // If this function's address is taken, it may be called indirectly, so
  // conservatively add all attributes corresponding to the implicit args.
  if (CallingConvSupportsAllImplicits &&
      F.hasAddressTaken(nullptr, true, true, true)) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      if (isa<AllocaInst>(I)) {
        HaveStackObjects = true;
        continue;
      }

      if (auto *CB = dyn_cast<CallBase>(&I)) {
        const Function *Callee =
            dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());

        // Note the occurrence of an indirect call.
        if (!Callee) {
          if (!CB->isInlineAsm()) {
            HasIndirectCall = true;
            HaveCall = true;
          }
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;

          StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
                                                   NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
                                          HasApertureRegs)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (!IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-calls");
    Changed = true;
  }

  if (HaveStackObjects) {
    F.addFnAttr("amdgpu-stack-objects");
    Changed = true;
  }

  // This pass cannot copy attributes from callees to callers when the call is
  // indirect, and hasAddressTaken() is false for kernels and functions that
  // make an indirect call (if they are themselves not indirectly called). For
  // correctness we must therefore tag all such kernels/functions with all
  // implicit-argument attributes. For example:
  // 1. Kernel K1 makes an indirect call to function F1. Without detecting the
  //    indirect call in K1, this pass would not add all implicit args to K1
  //    (which is incorrect).
  // 2. Kernel K1 makes a direct call to F1, which makes an indirect call to
  //    function F2. Without detecting the indirect call in F1 (whose
  //    hasAddressTaken() is false), the pass would not add all implicit args
  //    to F1 (which is essential for correctness).
  if (CallingConvSupportsAllImplicits && HasIndirectCall) {
    for (StringRef AttrName : ImplicitAttrNames) {
      F.addFnAttr(AttrName);
    }
    Changed = true;
  }

  return Changed;
}

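// Collect the nodes of each SCC, run the uniform-work-group-size propagation
// once an unreferenced (root) node is reached, and add feature attributes to
// every defined, non-graphics function.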
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes from the most referenced to the least;
    // once an unreferenced node is reached, process the accumulated list.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (!F || F->isDeclaration() || AMDGPU::isGraphics(F->getCallingConv()))
      continue;
    // Add feature attributes
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

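// Cache the TargetMachine from TargetPassConfig; this pass cannot run without
// one.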
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

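// Factory hook used by the AMDGPU backend to add this pass to its pipeline.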
Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}