//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

static constexpr StringLiteral ImplicitAttrNames[] = {
    // X ids unnecessarily propagated to kernels.
    "amdgpu-work-item-id-x",  "amdgpu-work-item-id-y",
    "amdgpu-work-item-id-z",  "amdgpu-work-group-id-x",
    "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
    "amdgpu-dispatch-ptr",    "amdgpu-dispatch-id",
    "amdgpu-queue-ptr",       "amdgpu-implicitarg-ptr"};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static StringRef intrinsicToAttrName(Intrinsic::ID ID, bool &NonKernelOnly,
                                     bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-item-id-x";
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return "amdgpu-work-group-id-x";
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return "amdgpu-work-item-id-y";
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return "amdgpu-work-item-id-z";
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return "amdgpu-work-group-id-y";
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return "amdgpu-work-group-id-z";
  case Intrinsic::amdgcn_dispatch_ptr:
    return "amdgpu-dispatch-ptr";
  case Intrinsic::amdgcn_dispatch_id:
    return "amdgpu-dispatch-id";
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    return "amdgpu-kernarg-segment-ptr";
  case Intrinsic::amdgcn_implicitarg_ptr:
    return "amdgpu-implicitarg-ptr";
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return "amdgpu-queue-ptr";
  default:
    return "";
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;
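  /// Properties of a constant that matter for "amdgpu-queue-ptr" deduction,
  /// collected as a bitmask: DS_GLOBAL means the constant (transitively)
  /// references a global in the local or region address space, ADDR_SPACE_CAST
  /// means it contains an addrspacecast from the local or private address
  /// space.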
  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

private:
  /// Check if the ConstantExpr \p CE requires queue ptr attribute.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs a queue ptr attribute because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs a queue ptr attribute.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

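/// Abstract attribute that collects the set of implicit argument attribute
/// names (e.g. "amdgpu-dispatch-ptr") a function requires.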
struct AAAMDAttributes : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  virtual const DenseSet<StringRef> &getAttributes() const = 0;

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

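/// Abstract attribute that propagates a kernel's "uniform-work-group-size"
/// setting down the call graph to the functions it calls.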
struct AAAMDWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
                                               Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDWorkGroupSize::ID = 0;

struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
  AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

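  // Propagate "uniform-work-group-size" top-down: this function's state is
  // clamped by the state of every known caller, so it can only remain true if
  // all callers have it set to true. If not all call sites are known, fall
  // back to the pessimistic fixpoint.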
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      indicatePessimisticFixpoint();

    return Change;
  }

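  // Write the deduced value back as the "uniform-work-group-size" string
  // attribute on the function.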
  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
                                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
  llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();
    bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }

    for (StringRef Attr : ImplicitAttrNames) {
      if (F->hasFnAttribute(Attr))
        Attributes.insert(Attr);
    }

    // TODO: We shouldn't need this in the future.
    if (CallingConvSupportsAllImplicits &&
        F->hasAddressTaken(nullptr, true, true, true)) {
      for (StringRef AttrName : ImplicitAttrNames) {
        Attributes.insert(AttrName);
      }
    }
  }

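  // Deduce the required implicit arguments from the call graph: intrinsic
  // calls map directly to attribute names via intrinsicToAttrName, while calls
  // to other functions pull in whatever implicit arguments the callee has been
  // found to require.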
  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    ChangeStatus Change = ChangeStatus::UNCHANGED;
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
    CallingConv::ID CC = F->getCallingConv();
    bool CallingConvSupportsAllImplicits = (CC != CallingConv::AMDGPU_Gfx);
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    auto AddAttribute = [&](StringRef AttrName) {
      if (Attributes.insert(AttrName).second)
        Change = ChangeStatus::CHANGED;
    };

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);

    // We have to assume that we can reach a function with these attributes.
    // We do not consider inline assembly as an unknown callee.
    if (CallingConvSupportsAllImplicits && AAEdges.hasNonAsmUnknownCallee()) {
      for (StringRef AttrName : ImplicitAttrNames) {
        AddAttribute(AttrName);
      }
    }

    bool NeedsQueuePtr = false;
    bool HasCall = false;
    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID != Intrinsic::not_intrinsic) {
        if (!IsNonEntryFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
          AddAttribute("amdgpu-kernarg-segment-ptr");
          continue;
        }

        bool NonKernelOnly = false;
        StringRef AttrName =
            intrinsicToAttrName(IID, NonKernelOnly, NeedsQueuePtr);

        if (!AttrName.empty() && (IsNonEntryFunc || !NonKernelOnly))
          AddAttribute(AttrName);

        continue;
      }

      HasCall = true;
      const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
      const DenseSet<StringRef> &CalleeAttributes = AAAMD.getAttributes();
      // Propagate implicit attributes from called function.
      for (StringRef AttrName : ImplicitAttrNames)
        if (CalleeAttributes.count(AttrName))
          AddAttribute(AttrName);
    }

    HasCall |= AAEdges.hasUnknownCallee();
    if (!IsNonEntryFunc && HasCall)
      AddAttribute("amdgpu-calls");

    // Check the function body.
    auto CheckAlloca = [&](Instruction &I) {
      AddAttribute("amdgpu-stack-objects");
      return false;
    };

    bool UsedAssumedInformation = false;
    A.checkForAllInstructions(CheckAlloca, *this, {Instruction::Alloca},
                              UsedAssumedInformation);

    // If we found that we need amdgpu-queue-ptr, nothing else to do.
    if (NeedsQueuePtr || Attributes.count("amdgpu-queue-ptr")) {
      AddAttribute("amdgpu-queue-ptr");
      return Change;
    }

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking every instruction
    // ourselves, so try it first.

    // amdgpu-queue-ptr is not needed if aperture registers are present.
    if (!HasApertureRegs)
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);

    // If we found that we need amdgpu-queue-ptr, nothing else to do.
    if (NeedsQueuePtr) {
      AddAttribute("amdgpu-queue-ptr");
      return Change;
    }

    if (!IsNonEntryFunc && HasApertureRegs)
      return Change;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F)) {
              AddAttribute("amdgpu-queue-ptr");
              return Change;
            }
          }
        }
      }
    }

    return Change;
  }

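  // Attach every deduced attribute name to the function.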
  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (StringRef AttrName : Attributes)
      AttrList.push_back(Attribute::get(Ctx, AttrName));

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    return "AMDInfo[" + std::to_string(Attributes.size()) + "]";
  }

  const DenseSet<StringRef> &getAttributes() const override {
    return Attributes;
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  DenseSet<StringRef> Attributes;
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

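/// Module pass that runs the Attributor with the AMDGPU abstract attributes
/// over every non-intrinsic function in the module.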
class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// Cache the TargetMachine from the TargetPassConfig before any pass runs;
  /// the information cache needs it to query the subtarget.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    Attributor A(Functions, InfoCache, CGUpdater);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)
527