//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,

  // SGPRs
  DISPATCH_PTR = 1 << 0,
  QUEUE_PTR = 1 << 1,
  DISPATCH_ID = 1 << 2,
  IMPLICIT_ARG_PTR = 1 << 3,
  WORKGROUP_ID_X = 1 << 4,
  WORKGROUP_ID_Y = 1 << 5,
  WORKGROUP_ID_Z = 1 << 6,
  // VGPRs
  WORKITEM_ID_X = 1 << 7,
  WORKITEM_ID_Y = 1 << 8,
  WORKITEM_ID_Z = 1 << 9,
  ALL_ARGUMENT_MASK = (1 << 10) - 1
};

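// Pair each implicit argument with the "amdgpu-no-*" function attribute that
// records that the associated function does not need that argument.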
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
  {DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
  {QUEUE_PTR, "amdgpu-no-queue-ptr"},
  {DISPATCH_ID, "amdgpu-no-dispatch-id"},
  {IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
  {WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
  {WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
  {WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
  {WORKITEM_ID_X, "amdgpu-no-workitem-id-x"},
  {WORKITEM_ID_Y, "amdgpu-no-workitem-id-y"},
  {WORKITEM_ID_Z, "amdgpu-no-workitem-id-z"}
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &IsQueuePtr) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    // TODO: Does not require queue ptr on gfx9+
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
    IsQueuePtr = true;
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

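// Casts from the local or private address space to flat need the aperture base
// addresses, which are obtained via the queue pointer on subtargets that do
// not have aperture registers.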
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

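// Returns true if \p C is a global in the LDS (local) or region address space.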
static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

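/// InformationCache extension that keeps the TargetMachine available for
/// per-function subtarget queries and caches, per Constant, whether it
/// transitively references DS globals or address space casts that require the
/// queue pointer.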
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

private:
  /// Check if the ConstantExpr \p CE requires queue ptr attribute.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs a queue ptr attribute because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs a queue ptr attribute.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

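/// Abstract attribute that tracks, as a bit set over ImplicitArgumentMask,
/// which implicit kernel arguments are still assumed to be unneeded by the
/// associated function.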
struct AAAMDAttributes : public StateWrapper<
  BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

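/// Abstract attribute that tracks whether the associated function can assume a
/// uniform work-group size ("uniform-work-group-size"="true").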
struct AAAMDWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWorkGroupSize &createForPosition(const IRPosition &IRP,
                                               Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDWorkGroupSize::ID = 0;

struct AAAMDWorkGroupSizeFunction : public AAAMDWorkGroupSize {
  AAAMDWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

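  /// Propagate "uniform-work-group-size" from the callers: the state is
  /// clamped against every known call site's caller, and falls back to the
  /// pessimistic fixpoint if not all call sites can be visited.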
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAAMDWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the state is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAAMDWorkGroupSize &AAAMDWorkGroupSize::createForPosition(const IRPosition &IRP,
                                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWorkGroupSizeFunction(IRP, A);
  llvm_unreachable("AAAMDWorkGroupSize is only valid for function position");
}

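/// Function-position implementation of AAAMDAttributes. It starts from the
/// optimistic assumption that no implicit argument is needed and removes
/// assumed bits as uses are discovered.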
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    for (auto Attr : ImplicitAttrs) {
      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

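  /// Deduce the required implicit arguments by walking the optimistic call
  /// edges (intrinsic uses and callee attributes), then checking address space
  /// casts and constant operands that may require the queue pointer.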
  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsQueuePtr);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
    }

    // If we found that we need amdgpu-queue-ptr, nothing else to do.
    if (NeedsQueuePtr) {
      removeAssumedBits(QUEUE_PTR);
      return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
                                           ChangeStatus::UNCHANGED;
    }

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking all instructions
    // ourselves, so try it first.

    // amdgpu-queue-ptr is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need amdgpu-queue-ptr, nothing else to do.
    if (NeedsQueuePtr) {
      removeAssumedBits(QUEUE_PTR);
      return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
                                           ChangeStatus::UNCHANGED;
    }

    if (!IsNonEntryFunc && HasApertureRegs) {
      return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
                                           ChangeStatus::UNCHANGED;
    }

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F)) {
              removeAssumedBits(QUEUE_PTR);
              return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
                                                   ChangeStatus::UNCHANGED;
            }
          }
        }
      }
    }

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED :
                                         ChangeStatus::UNCHANGED;
  }


  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

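/// Legacy module pass that runs the Attributor with the AMDGPU-specific
/// abstract attributes over all non-intrinsic functions in the module.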
class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAAMDWorkGroupSize::ID, &AACallEdges::ID});

    Attributor A(Functions, InfoCache, CGUpdater, &Allowed);

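    // Seed the AMDGPU-specific abstract attributes for every non-intrinsic
    // function; the Attributor run below deduces and manifests them.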
    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAAMDWorkGroupSize>(IRPosition::function(F));
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)