1 //===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
/// \file This pass adds target attributes to functions that use intrinsics
/// affecting calling convention lowering.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUSubtarget.h"
16 #include "Utils/AMDGPUBaseInfo.h"
17 #include "llvm/ADT/SmallPtrSet.h"
18 #include "llvm/ADT/SmallVector.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/Triple.h"
21 #include "llvm/Analysis/CallGraph.h"
22 #include "llvm/Analysis/CallGraphSCCPass.h"
23 #include "llvm/CodeGen/TargetPassConfig.h"
24 #include "llvm/IR/CallSite.h"
25 #include "llvm/IR/Constant.h"
26 #include "llvm/IR/Constants.h"
27 #include "llvm/IR/Function.h"
28 #include "llvm/IR/Instruction.h"
29 #include "llvm/IR/Instructions.h"
30 #include "llvm/IR/Intrinsics.h"
31 #include "llvm/IR/Module.h"
32 #include "llvm/IR/Type.h"
33 #include "llvm/IR/Use.h"
34 #include "llvm/Pass.h"
35 #include "llvm/Support/Casting.h"
36 #include "llvm/Support/ErrorHandling.h"
37 #include "llvm/Target/TargetMachine.h"
38 
39 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
40 
41 using namespace llvm;
42 
43 namespace {
44 
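// Adds attributes to functions recording which implicit inputs they require
// (work item and work group ids, dispatch, queue, kernarg segment and
// implicitarg pointers), and propagates those attributes from callees to
// callers. Also propagates the "uniform-work-group-size" attribute from
// callers to callees.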
45 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
46 private:
47   const TargetMachine *TM = nullptr;
48   SmallVector<CallGraphNode*, 8> NodeList;
49 
50   bool addFeatureAttributes(Function &F);
51   bool processUniformWorkGroupAttribute();
52   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
53 
54 public:
55   static char ID;
56 
57   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
58 
59   bool doInitialization(CallGraph &CG) override;
60   bool runOnSCC(CallGraphSCC &SCC) override;
61 
62   StringRef getPassName() const override {
63     return "AMDGPU Annotate Kernel Features";
64   }
65 
66   void getAnalysisUsage(AnalysisUsage &AU) const override {
67     AU.setPreservesAll();
68     CallGraphSCCPass::getAnalysisUsage(AU);
69   }
70 
71   static bool visitConstantExpr(const ConstantExpr *CE);
72   static bool visitConstantExprsRecursively(
73     const Constant *EntryC,
74     SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
75     bool HasApertureRegs);
76 };
77 
78 } // end anonymous namespace
79 
80 char AMDGPUAnnotateKernelFeatures::ID = 0;
81 
82 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
83 
84 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
85                 "Add AMDGPU function attributes", false, false)
86 
87 
88 // The queue ptr is only needed when casting to flat, not from it.
89 static bool castRequiresQueuePtr(unsigned SrcAS) {
90   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
91 }
92 
93 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
94   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
95 }
96 
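// Returns true if the constant is a global value in the local (LDS) or region
// address space.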
97 static bool isDSAddress(const Constant *C) {
98   const GlobalValue *GV = dyn_cast<GlobalValue>(C);
99   if (!GV)
100     return false;
101   unsigned AS = GV->getAddressSpace();
102   return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
103 }
104 
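// Returns true if the constant expression is an addrspacecast whose source
// address space requires the queue ptr.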
105 bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
106   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
107     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
108     return castRequiresQueuePtr(SrcAS);
109   }
110 
111   return false;
112 }
113 
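// Walks the constants reachable from EntryC and returns true if any of them
// requires the queue ptr: a DS global referenced from a non-entry function,
// or (when aperture registers are unavailable) an addrspacecast from a local
// or private pointer.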
114 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
115   const Constant *EntryC,
116   SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
117   bool IsFunc, bool HasApertureRegs) {
118 
119   if (!ConstantExprVisited.insert(EntryC).second)
120     return false;
121 
122   SmallVector<const Constant *, 16> Stack;
123   Stack.push_back(EntryC);
124 
125   while (!Stack.empty()) {
126     const Constant *C = Stack.pop_back_val();
127 
    // A DS global referenced from a non-entry function requires a trap, which
    // in turn requires the queue ptr.
129     if (IsFunc && isDSAddress(C))
130       return true;
131 
132     // Check this constant expression.
133     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
134       if (!HasApertureRegs && visitConstantExpr(CE))
135         return true;
136     }
137 
138     // Visit all sub-expressions.
139     for (const Use &U : C->operands()) {
140       const auto *OpC = dyn_cast<Constant>(U);
141       if (!OpC)
142         continue;
143 
144       if (!ConstantExprVisited.insert(OpC).second)
145         continue;
146 
147       Stack.push_back(OpC);
148     }
149   }
150 
151   return false;
152 }
153 
// The x workitem and workgroup ids are always enabled in kernels, so they only
// need to be noted for non-kernel functions.
156 //
157 // TODO: We should not add the attributes if the known compile time workgroup
158 // size is 1 for y/z.
159 static StringRef intrinsicToAttrName(Intrinsic::ID ID,
160                                      bool &NonKernelOnly,
161                                      bool &IsQueuePtr) {
162   switch (ID) {
163   case Intrinsic::amdgcn_workitem_id_x:
164     NonKernelOnly = true;
165     return "amdgpu-work-item-id-x";
166   case Intrinsic::amdgcn_workgroup_id_x:
167     NonKernelOnly = true;
168     return "amdgpu-work-group-id-x";
169   case Intrinsic::amdgcn_workitem_id_y:
170   case Intrinsic::r600_read_tidig_y:
171     return "amdgpu-work-item-id-y";
172   case Intrinsic::amdgcn_workitem_id_z:
173   case Intrinsic::r600_read_tidig_z:
174     return "amdgpu-work-item-id-z";
175   case Intrinsic::amdgcn_workgroup_id_y:
176   case Intrinsic::r600_read_tgid_y:
177     return "amdgpu-work-group-id-y";
178   case Intrinsic::amdgcn_workgroup_id_z:
179   case Intrinsic::r600_read_tgid_z:
180     return "amdgpu-work-group-id-z";
181   case Intrinsic::amdgcn_dispatch_ptr:
182     return "amdgpu-dispatch-ptr";
183   case Intrinsic::amdgcn_dispatch_id:
184     return "amdgpu-dispatch-id";
185   case Intrinsic::amdgcn_kernarg_segment_ptr:
186     return "amdgpu-kernarg-segment-ptr";
187   case Intrinsic::amdgcn_implicitarg_ptr:
188     return "amdgpu-implicitarg-ptr";
189   case Intrinsic::amdgcn_queue_ptr:
190   case Intrinsic::amdgcn_is_shared:
191   case Intrinsic::amdgcn_is_private:
192     // TODO: Does not require queue ptr on gfx9+
193   case Intrinsic::trap:
194   case Intrinsic::debugtrap:
195     IsQueuePtr = true;
196     return "amdgpu-queue-ptr";
197   default:
198     return "";
199   }
200 }
201 
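// If the callee carries the given attribute, copy it to the parent and return
// true.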
202 static bool handleAttr(Function &Parent, const Function &Callee,
203                        StringRef Name) {
204   if (Callee.hasFnAttribute(Name)) {
205     Parent.addFnAttr(Name);
206     return true;
207   }
208   return false;
209 }
210 
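// Copy the implicit input attributes required by the callee up to the calling
// function, setting NeedQueuePtr if the callee requires the queue ptr.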
211 static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
212                                    bool &NeedQueuePtr) {
  // Note: the x ids in this list are unnecessarily propagated to kernels,
  // where they are always enabled.
214   static constexpr StringLiteral AttrNames[] = {
215       "amdgpu-work-item-id-x",      "amdgpu-work-item-id-y",
216       "amdgpu-work-item-id-z",      "amdgpu-work-group-id-x",
217       "amdgpu-work-group-id-y",     "amdgpu-work-group-id-z",
218       "amdgpu-dispatch-ptr",        "amdgpu-dispatch-id",
219       "amdgpu-implicitarg-ptr"};
220 
221   if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
222     NeedQueuePtr = true;
223 
224   for (StringRef AttrName : AttrNames)
225     handleAttr(Parent, Callee, AttrName);
226 }
227 
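// Walk the recorded call graph nodes and propagate the
// "uniform-work-group-size" attribute from each caller to its callees.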
228 bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
229   bool Changed = false;
230 
231   for (auto *Node : reverse(NodeList)) {
232     Function *Caller = Node->getFunction();
233 
234     for (auto I : *Node) {
235       Function *Callee = std::get<1>(I)->getFunction();
236       if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
238     }
239   }
240 
241   return Changed;
242 }
243 
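// Propagate "uniform-work-group-size" from the caller to the callee,
// defaulting both to "false" when the value is unknown. Returns true if an
// attribute was added.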
244 bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
245        Function &Caller, Function &Callee) {
246 
  // An externally defined callee cannot be analyzed; conservatively mark both
  // the callee and the caller as not having a uniform work-group size.
248   if (!Callee.hasExactDefinition()) {
249     Callee.addFnAttr("uniform-work-group-size", "false");
250     if (!Caller.hasFnAttribute("uniform-work-group-size"))
251       Caller.addFnAttr("uniform-work-group-size", "false");
252 
253     return true;
254   }
255   // Check if the Caller has the attribute
256   if (Caller.hasFnAttribute("uniform-work-group-size")) {
257     // Check if the value of the attribute is true
258     if (Caller.getFnAttribute("uniform-work-group-size")
259         .getValueAsString().equals("true")) {
260       // Propagate the attribute to the Callee, if it does not have it
261       if (!Callee.hasFnAttribute("uniform-work-group-size")) {
262         Callee.addFnAttr("uniform-work-group-size", "true");
263         return true;
264       }
265     } else {
266       Callee.addFnAttr("uniform-work-group-size", "false");
267       return true;
268     }
269   } else {
270     // If the attribute is absent, set it as false
271     Caller.addFnAttr("uniform-work-group-size", "false");
272     Callee.addFnAttr("uniform-work-group-size", "false");
273     return true;
274   }
275   return false;
276 }
277 
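// Scan the body of F for calls, intrinsics, addrspacecasts, and constant
// expressions, and add the corresponding "amdgpu-*" attributes to F.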
278 bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
279   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
280   bool HasApertureRegs = ST.hasApertureRegs();
281   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
282 
283   bool Changed = false;
284   bool NeedQueuePtr = false;
285   bool HaveCall = false;
286   bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());
287 
288   for (BasicBlock &BB : F) {
289     for (Instruction &I : BB) {
290       CallSite CS(&I);
291       if (CS) {
292         Function *Callee = CS.getCalledFunction();
293 
294         // TODO: Do something with indirect calls.
295         if (!Callee) {
296           if (!CS.isInlineAsm())
297             HaveCall = true;
298           continue;
299         }
300 
301         Intrinsic::ID IID = Callee->getIntrinsicID();
302         if (IID == Intrinsic::not_intrinsic) {
303           HaveCall = true;
304           copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
305           Changed = true;
306         } else {
307           bool NonKernelOnly = false;
308 
309           if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
310             F.addFnAttr("amdgpu-kernarg-segment-ptr");
311           } else {
312             StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
313                                                      NeedQueuePtr);
314             if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
315               F.addFnAttr(AttrName);
316               Changed = true;
317             }
318           }
319         }
320       }
321 
322       if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
323         continue;
324 
325       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
326         if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
327           NeedQueuePtr = true;
328           continue;
329         }
330       }
331 
332       for (const Use &U : I.operands()) {
333         const auto *OpC = dyn_cast<Constant>(U);
334         if (!OpC)
335           continue;
336 
337         if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
338                                           HasApertureRegs)) {
339           NeedQueuePtr = true;
340           break;
341         }
342       }
343     }
344   }
345 
346   if (NeedQueuePtr) {
347     F.addFnAttr("amdgpu-queue-ptr");
348     Changed = true;
349   }
350 
351   // TODO: We could refine this to captured pointers that could possibly be
352   // accessed by flat instructions. For now this is mostly a poor way of
353   // estimating whether there are calls before argument lowering.
354   if (!IsFunc && HaveCall) {
355     F.addFnAttr("amdgpu-calls");
356     Changed = true;
357   }
358 
359   return Changed;
360 }
361 
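// SCCs are visited bottom-up, so callees are processed before their callers.
// Accumulate nodes for the uniform-work-group-size propagation and add feature
// attributes to every defined function in the SCC.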
362 bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
363   bool Changed = false;
364 
365   for (CallGraphNode *I : SCC) {
    // Build up a list of CallGraphNodes, ordered from most references to
    // least. When a node with no references (a root) is reached, process the
    // accumulated list and start over.
367     if (I->getNumReferences())
368       NodeList.push_back(I);
369     else {
370       processUniformWorkGroupAttribute();
371       NodeList.clear();
372     }
373 
374     Function *F = I->getFunction();
375     // Add feature attributes
376     if (!F || F->isDeclaration())
377       continue;
378     Changed |= addFeatureAttributes(*F);
379   }
380 
381   return Changed;
382 }
383 
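// Cache the TargetMachine from TargetPassConfig; the per-function subtarget is
// queried later in addFeatureAttributes.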
384 bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
385   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
386   if (!TPC)
387     report_fatal_error("TargetMachine is required");
388 
389   TM = &TPC->getTM<TargetMachine>();
390   return false;
391 }
392 
393 Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
394   return new AMDGPUAnnotateKernelFeatures();
395 }
396