//===- AMDGPUAnnotateKernelFeaturesPass.cpp -------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass adds target attributes to functions which use intrinsics
/// that will impact calling convention lowering.
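///
/// For example, a function that calls llvm.amdgcn.workitem.id.y is annotated
/// with "amdgpu-work-item-id-y", and a function that needs the queue pointer
/// (e.g. via llvm.amdgcn.queue.ptr or certain address space casts) is
/// annotated with "amdgpu-queue-ptr".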
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
18 #include "llvm/ADT/SmallPtrSet.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/Triple.h"
22 #include "llvm/Analysis/CallGraph.h"
23 #include "llvm/Analysis/CallGraphSCCPass.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/IR/CallSite.h"
26 #include "llvm/IR/Constant.h"
27 #include "llvm/IR/Constants.h"
28 #include "llvm/IR/Function.h"
29 #include "llvm/IR/Instruction.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/IR/Intrinsics.h"
32 #include "llvm/IR/Module.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/IR/Use.h"
35 #include "llvm/Pass.h"
36 #include "llvm/Support/Casting.h"
37 #include "llvm/Support/ErrorHandling.h"
38 #include "llvm/Target/TargetMachine.h"
39 
40 #define DEBUG_TYPE "amdgpu-annotate-kernel-features"
41 
42 using namespace llvm;
43 
44 namespace {
45 
46 class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
47 private:
48   const TargetMachine *TM = nullptr;
49   SmallVector<CallGraphNode*, 8> NodeList;
50 
51   bool addFeatureAttributes(Function &F);
52   bool processUniformWorkGroupAttribute();
53   bool propagateUniformWorkGroupAttribute(Function &Caller, Function &Callee);
54 
55 public:
56   static char ID;
57 
58   AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {}
59 
60   bool doInitialization(CallGraph &CG) override;
61   bool runOnSCC(CallGraphSCC &SCC) override;
62 
63   StringRef getPassName() const override {
64     return "AMDGPU Annotate Kernel Features";
65   }
66 
67   void getAnalysisUsage(AnalysisUsage &AU) const override {
68     AU.setPreservesAll();
69     CallGraphSCCPass::getAnalysisUsage(AU);
70   }
71 
72   static bool visitConstantExpr(const ConstantExpr *CE);
73   static bool visitConstantExprsRecursively(
74     const Constant *EntryC,
75     SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
76 };
77 
78 } // end anonymous namespace
79 
80 char AMDGPUAnnotateKernelFeatures::ID = 0;
81 
82 char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
83 
84 INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
85                 "Add AMDGPU function attributes", false, false)
86 
87 
88 // The queue ptr is only needed when casting to flat, not from it.
89 static bool castRequiresQueuePtr(unsigned SrcAS) {
90   return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
91 }
92 
93 static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
94   return castRequiresQueuePtr(ASC->getSrcAddressSpace());
95 }
96 
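// Return true if this constant expression is an addrspacecast whose source
// address space requires access to the queue pointer.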
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
  if (CE->getOpcode() == Instruction::AddrSpaceCast) {
    unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
    return castRequiresQueuePtr(SrcAS);
  }

  return false;
}

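// Walk EntryC and its operands iteratively, returning true if any reachable
// constant expression requires the queue pointer. ConstantExprVisited caches
// constants already seen so shared subexpressions are only checked once.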
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
  const Constant *EntryC,
  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {

  if (!ConstantExprVisited.insert(EntryC).second)
    return false;

  SmallVector<const Constant *, 16> Stack;
  Stack.push_back(EntryC);

  while (!Stack.empty()) {
    const Constant *C = Stack.pop_back_val();

    // Check this constant expression.
    if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
      if (visitConstantExpr(CE))
        return true;
    }

    // Visit all sub-expressions.
    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      if (!ConstantExprVisited.insert(OpC).second)
        continue;

      Stack.push_back(OpC);
    }
  }

  return false;
}

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
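//
// NonKernelOnly is set when the returned attribute only needs to be added to
// non-kernel functions (kernels always have the x ids initialized).
// IsQueuePtr is set when the intrinsic also requires access to the queue
// pointer.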
static StringRef intrinsicToAttrName(Intrinsic::ID ID,
                                     bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

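// Copy the attribute Name from the callee to the parent (caller) if the
// callee has it. Returns true if the callee had the attribute.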
static bool handleAttr(Function &Parent, const Function &Callee,
                       StringRef Name) {
  if (Callee.hasFnAttribute(Name)) {
    Parent.addFnAttr(Name);
    return true;
  }
  return false;
}

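// Propagate the implicit-input attributes required by a callee to its caller:
// anything the callee needs, the caller needs as well. Sets NeedQueuePtr if
// the callee requires the queue pointer.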
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
                                   bool &NeedQueuePtr) {
  // Note: the x ids are propagated to kernels as well, even though they are
  // unnecessary there (kernels always have them initialized).
  static const StringRef AttrNames[] = {
    { "amdgpu-work-item-id-x" },
    { "amdgpu-work-item-id-y" },
    { "amdgpu-work-item-id-z" },
    { "amdgpu-work-group-id-x" },
    { "amdgpu-work-group-id-y" },
    { "amdgpu-work-group-id-z" },
    { "amdgpu-dispatch-ptr" },
    { "amdgpu-dispatch-id" },
    { "amdgpu-kernarg-segment-ptr" },
    { "amdgpu-implicitarg-ptr" }
  };

  if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
    NeedQueuePtr = true;

  for (StringRef AttrName : AttrNames)
    handleAttr(Parent, Callee, AttrName);
}

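// Propagate "uniform-work-group-size" along the call edges recorded in
// NodeList. runOnSCC accumulates the list in bottom-up SCC order, so walking
// it in reverse visits callers before their callees.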
bool AMDGPUAnnotateKernelFeatures::processUniformWorkGroupAttribute() {
  bool Changed = false;

  for (auto *Node : reverse(NodeList)) {
    Function *Caller = Node->getFunction();

    for (auto I : *Node) {
      Function *Callee = std::get<1>(I)->getFunction();
      if (Callee)
        Changed |= propagateUniformWorkGroupAttribute(*Caller, *Callee);
    }
  }

  return Changed;
}

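// Propagate the caller's "uniform-work-group-size" value to the callee,
// conservatively defaulting to "false" when the callee has no exact
// definition or the caller has no attribute. Returns true if any attribute
// was added.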
bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
       Function &Caller, Function &Callee) {

  // Check for externally defined function
  if (!Callee.hasExactDefinition()) {
    Callee.addFnAttr("uniform-work-group-size", "false");
    if (!Caller.hasFnAttribute("uniform-work-group-size"))
      Caller.addFnAttr("uniform-work-group-size", "false");

    return true;
  }
  // Check if the Caller has the attribute
  if (Caller.hasFnAttribute("uniform-work-group-size")) {
    // Check if the value of the attribute is true
    if (Caller.getFnAttribute("uniform-work-group-size")
        .getValueAsString().equals("true")) {
      // Propagate the attribute to the Callee, if it does not have it
      if (!Callee.hasFnAttribute("uniform-work-group-size")) {
        Callee.addFnAttr("uniform-work-group-size", "true");
        return true;
      }
    } else {
      Callee.addFnAttr("uniform-work-group-size", "false");
      return true;
    }
  } else {
    // If the attribute is absent, set it as false
    Caller.addFnAttr("uniform-work-group-size", "false");
    Callee.addFnAttr("uniform-work-group-size", "false");
    return true;
  }
  return false;
}

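// Scan the body of F for calls and address space casts and add the
// corresponding target attributes to F. Returns true if any attribute was
// added.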
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
  bool HasFlat = ST.hasFlatAddressSpace();
  bool HasApertureRegs = ST.hasApertureRegs();
  SmallPtrSet<const Constant *, 8> ConstantExprVisited;

  bool Changed = false;
  bool NeedQueuePtr = false;
  bool HaveCall = false;
  bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv());

  for (BasicBlock &BB : F) {
    for (Instruction &I : BB) {
      CallSite CS(&I);
      if (CS) {
        Function *Callee = CS.getCalledFunction();

        // TODO: Do something with indirect calls.
        if (!Callee) {
          if (!CS.isInlineAsm())
            HaveCall = true;
          continue;
        }

        Intrinsic::ID IID = Callee->getIntrinsicID();
        if (IID == Intrinsic::not_intrinsic) {
          HaveCall = true;
          copyFeaturesToFunction(F, *Callee, NeedQueuePtr);
          Changed = true;
        } else {
          bool NonKernelOnly = false;
          StringRef AttrName = intrinsicToAttrName(IID,
                                                   NonKernelOnly, NeedQueuePtr);
          if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
            F.addFnAttr(AttrName);
            Changed = true;
          }
        }
      }

      if (NeedQueuePtr || HasApertureRegs)
        continue;

      if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
        if (castRequiresQueuePtr(ASC)) {
          NeedQueuePtr = true;
          continue;
        }
      }

      for (const Use &U : I.operands()) {
        const auto *OpC = dyn_cast<Constant>(U);
        if (!OpC)
          continue;

        if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
          NeedQueuePtr = true;
          break;
        }
      }
    }
  }

  if (NeedQueuePtr) {
    F.addFnAttr("amdgpu-queue-ptr");
    Changed = true;
  }

  // TODO: We could refine this to captured pointers that could possibly be
  // accessed by flat instructions. For now this is mostly a poor way of
  // estimating whether there are calls before argument lowering.
  if (HasFlat && !IsFunc && HaveCall) {
    F.addFnAttr("amdgpu-flat-scratch");
    Changed = true;
  }

  return Changed;
}

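// For each node in the SCC, record it for uniform-work-group-size propagation
// (triggering the propagation once an unreferenced node is reached) and add
// feature attributes to its function if it has a definition.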
bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) {
  bool Changed = false;

  for (CallGraphNode *I : SCC) {
    // Build a list of CallGraphNodes, ordered from most uses to least.
    if (I->getNumReferences())
      NodeList.push_back(I);
    else {
      Changed |= processUniformWorkGroupAttribute();
      NodeList.clear();
    }

    Function *F = I->getFunction();
    // Add feature attributes
    if (!F || F->isDeclaration())
      continue;
    Changed |= addFeatureAttributes(*F);
  }

  return Changed;
}

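// Cache the TargetMachine; the per-function subtarget is queried later in
// addFeatureAttributes.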
bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    report_fatal_error("TargetMachine is required");

  TM = &TPC->getTM<TargetMachine>();
  return false;
}

Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
  return new AMDGPUAnnotateKernelFeatures();
}