//===- AMDGPUAnnotateKernelFeaturesPass.cpp --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-annotate-kernel-features"

using namespace llvm;

namespace {

class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
private:
  const TargetMachine *TM = nullptr;
  SmallVector<CallGraphNode*, 8> NodeList;

  bool addFeatureAttributes(Function &F);
  bool processUniformWorkGroupAttribute();
  bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);

public:
  static char ID;

  AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}

  bool doInitialization(CallGraph &CG) override;
  bool runOnSCC(CallGraphSCC &SCC) override;

  StringRef getPassName() const override {
    return "AMDGPU Annotate Kernel Features";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  static bool visitConstantExpr(const ConstantExpr *CE);
  static bool visitConstantExprsRecursively(
    const Constant *EntryC,
    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
};

} // end anonymous namespace

char AMDGPUAnnotateKernelFeatures::ID = 0;

char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;

INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
                "Add AMDGPU function attributes", false, false)

// The queue ptr is only needed when casting to flat, not from it.
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}

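// Return true if the constant expression is an addrspacecast from an address
// space that requires the queue pointer (local or private).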
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

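// Walk a constant and its operands, returning true if any constant expression
// reachable from \p EntryC requires the queue pointer. Visited constants are
// recorded in \p ConstantExprVisited so shared subexpressions are only
// examined once.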
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

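// Copy the named attribute from \p Callee to \p Parent if the callee has it.
// Returns true if the attribute was copied.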
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

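// Propagate the intrinsic-derived input attributes from \p Callee to
// \p Parent, and record in \p NeedQueuePtr whether the callee also requires
// the queue pointer.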
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // X ids unnecessarily propagated to kernels.
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

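// Walk the collected call graph nodes and propagate the
// "uniform-work-group-size" attribute from each caller to its callees.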
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

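// Propagate "uniform-work-group-size" from \p Caller to \p Callee. Externally
// defined callees (and callers without the attribute) are conservatively
// marked "false". Returns true if an attribute was added.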
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for an externally defined function.
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute.
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true.
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it.
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it to false.
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

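// Scan the body of \p F and add the attributes implied by the intrinsics it
// calls, by calls to other functions, and by address space casts that need the
// queue pointer. Kernels that make calls are also marked with
// "amdgpu-flat-scratch" when the target has a flat address space.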
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

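// Visit each SCC bottom-up: record nodes that still have references for later
// uniform-work-group-size propagation, flush that list when an unreferenced
// node is reached, and add feature attributes to every defined function.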
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences()) {
      NodeList.push_back(I);
    } else {
      processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

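// Cache the TargetMachine; this pass cannot run without a TargetPassConfig.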
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}