//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses the Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

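// AMDGPUAttributes.def is an X-macro file: it is included three times below
// with different definitions of AMDGPU_ATTRIBUTE to build the bit-position
// enum, the corresponding bitmask enum, and a table mapping each mask to the
// IR attribute string that is manifested on functions.
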
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
 #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. Pre-V5, however, we need to access them through queue_ptr +
    // offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID is supported since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

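/// Returns true if an addrspacecast from \p SrcAS needs the queue pointer:
/// casting out of the private or local address space requires the matching
/// aperture base, which is loaded from the queue pointer when the subtarget
/// has no aperture registers.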
static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
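/// InformationCache specialization that gives the AMDGPU abstract attributes
/// access to the TargetMachine for subtarget queries and caches, per constant,
/// whether a constant forces a function to need the queue pointer.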
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

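  /// Get the widest flat work group size range the subtarget supports,
  /// independent of any "amdgpu-flat-work-group-size" attribute already
  /// present on \p F (contrast with getFlatWorkGroupSizes above).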
  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

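/// Abstract attribute recording which implicit kernel inputs a function does
/// not need. A set bit means the corresponding input is assumed (or known) to
/// be unneeded; removing an assumed bit marks the input as required. The
/// manifest step emits the attribute string from ImplicitAttrs for every bit
/// that is still known once a fixpoint is reached.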
struct AAAMDAttributes : public StateWrapper<
  BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

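/// Abstract attribute propagating the "uniform-work-group-size" attribute:
/// kernels seed the value from their own attribute, and a callee can only keep
/// "true" if every known caller has it as well.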
struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

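/// Per-function implementation of AAAMDAttributes. Starting from the
/// attributes already on the function, it removes assumed bits based on the
/// intrinsics reachable through the optimistic call graph, address space
/// casts, constants that need the queue pointer, and loads through
/// amdgcn.implicitarg.ptr at known implicit-argument offsets.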
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if (IsNonEntryFunc || !NonKernelOnly)
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than walking every instruction
    // ourselves, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

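  // The helpers below check whether the function reads a particular implicit
  // kernel argument through amdgcn.implicitarg.ptr by intersecting the
  // function's pointer accesses with that argument's known offset and size.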
  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A,
                                      AAPointerInfo::OffsetAndSize OAS) {
    // Check if this is a call to the implicitarg_ptr intrinsic that is used to
    // retrieve the implicit kernel argument described by \p OAS (hostcall,
    // heap, or queue pointer). The argument is considered unused only if every
    // use of the implicitarg_ptr is a load that clearly does not touch any
    // byte in that offset range. We check this by tracing all the uses of the
    // initial call to the implicitarg_ptr intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate amdgpu-flat-work-group-size attribute.
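/// The range is seeded from the function's current flat work group size,
/// clamped against all callers for non-entry functions, and manifested as,
/// e.g., "amdgpu-flat-work-group-size"="64,128" (illustrative values) unless
/// it matches the subtarget default.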
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

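/// Legacy module pass driving the deduction: it runs the Attributor over every
/// non-intrinsic function in the module with only the AMDGPU abstract
/// attributes (plus AACallEdges and AAPointerInfo) enabled.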
class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});

    Attributor A(Functions, InfoCache, CGUpdater, &Allowed);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)