//===- AMDGPUAttributor.cpp -----------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass uses Attributor framework to deduce AMDGPU attributes.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/Attributor.h"

#define DEBUG_TYPE "amdgpu-attributor"

using namespace llvm;

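// AMDGPUAttributes.def is an X-macro table: AMDGPU_ATTRIBUTE is given a
// different definition before each include below, producing the
// argument-position enum, the implicit-argument bitmask enum, and the
// (mask, attribute-string) lookup table.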
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,

enum ImplicitArgumentPositions {
  #include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
  #include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask,
                           StringLiteral> ImplicitAttrs[] = {
 #include "AMDGPUAttributes.def"
};

// We do not need to note the x workitem or workgroup id because they are always
// initialized.
//
// TODO: We should not add the attributes if the known compile time workgroup
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID) {
  unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  // Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
  // queue_ptr.
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion == 5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under V5, we need implicitarg_ptr + offsets to access private_base or
    // shared_base. For pre-V5, however, we need to access them through
    // queue_ptr + offsets.
    return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
  case Intrinsic::trap:
    if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
      return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
    NeedsImplicit = (CodeObjectVersion == 5); // Need implicitarg_ptr under V5.
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}

static bool castRequiresQueuePtr(unsigned SrcAS) {
  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
}

static bool isDSAddress(const Constant *C) {
  const GlobalValue *GV = dyn_cast<GlobalValue>(C);
  if (!GV)
    return false;
  unsigned AS = GV->getAddressSpace();
  return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
}

/// Returns true if the function requires the implicit argument be passed
/// regardless of the function contents.
static bool funcRequiresHostcallPtr(const Function &F) {
  // Sanitizers require the hostcall buffer passed in the implicit arguments.
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}

namespace {
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
  TargetMachine &TM;

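  // Bitmask describing how a constant is used: whether it (transitively)
  // refers to an LDS/region global and whether it contains an addrspacecast
  // from the private or local address space.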
  enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };

  /// Check if the subtarget has aperture regs.
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  /// Check if the subtarget supports GetDoorbellID.
  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }

  std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getFlatWorkGroupSizes(F);
  }

  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }

private:
  /// Check if the ConstantExpr \p CE requires the queue pointer.
  static bool visitConstExpr(const ConstantExpr *CE) {
    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      return castRequiresQueuePtr(SrcAS);
    }
    return false;
  }

  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      if (visitConstExpr(CE))
        Result |= ADDR_SPACE_CAST;

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC)
        continue;

      Result |= getConstantAccess(OpC);
    }
    return Result;
  }

public:
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // No need to explore the constants.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    uint8_t Access = getConstantAccess(C);

    // We need to trap on DS globals in non-entry functions.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST);
  }

private:
  /// Used to determine if the Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
};

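// Tracks which implicit kernel arguments a function does NOT need. An assumed
// bit that is still set means the corresponding implicit argument is not
// required; removeAssumedBits() records a requirement. Only known bits are
// manifested as IR attributes.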
struct AAAMDAttributes : public StateWrapper<
  BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>, AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint16_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;

struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  const std::string getName() const override {
    return "AAUniformWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;

struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue = F->getFnAttribute("uniform-work-group-size")
                         .getValueAsString()
                         .equals("true");

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

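  // Propagate uniform-work-group-size from every caller: the callee can only
  // keep the property if all call sites are known and all callers have it.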
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr() const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};

AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}

struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // If the function requires the implicit arg pointer due to sanitizers,
    // assume it's needed even if explicitly marked as not requiring it.
    const bool NeedsHostcall = funcRequiresHostcallPtr(*F);
    if (NeedsHostcall) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
    }

    for (auto Attr : ImplicitAttrs) {
      if (NeedsHostcall &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are currently
    // not allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }

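  // Walk the optimistic call edges: merge the states of known callees and map
  // recognized intrinsics onto the implicit arguments they require, then
  // handle the cases that depend on the code object version.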
  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for Intrinsics and propagate attributes.
    const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (AAEdges.hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);

    for (Function *Callee : AAEdges.getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        *this &= AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID);
      if (AttrMask != NOT_IMPLICIT_INPUT) {
        if ((IsNonEntryFunc || !NonKernelOnly))
          removeAssumedBits(AttrMask);
      }
    }

    // Need implicitarg_ptr to access queue_ptr, private_base, and shared_base.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under V5, we need implicitarg_ptr + offsets to access private_base or
      // shared_base. We do not actually need queue_ptr.
      if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

private:
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // `checkForAllInstructions` is much cheaper than going through all
    // instructions, so try it first.

    // The queue pointer is not needed if aperture registers are present.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);
    }

    // If we found that we need the queue pointer, there is nothing else to do.
    if (NeedsQueuePtr)
      return true;

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

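    // Also look at constant operands: addrspacecast constant expressions and
    // LDS globals are not visited by checkForAllInstructions above.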
    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }

  bool funcRetrievesMultigridSyncArg(Attributor &A) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHostcallPtr(Attributor &A) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
    AAPointerInfo::OffsetAndSize OAS(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesHeapPtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesQueuePtr(Attributor &A) {
    if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
      return false;
    AAPointerInfo::OffsetAndSize OAS(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, OAS);
  }

  bool funcRetrievesImplicitKernelArg(Attributor &A,
                                      AAPointerInfo::OffsetAndSize OAS) {
    // Check if this is a call to the implicitarg_ptr intrinsic and whether it
    // is used to retrieve the implicit kernel argument described by \p OAS.
    // That argument is considered unused only if every access through the
    // implicitarg_ptr that may overlap its bytes is droppable. We check this
    // by tracing all the uses of the initial call to the implicitarg_ptr
    // intrinsic.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);

      return PointerInfoAA.forallInterferingAccesses(
          OAS, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }
};

AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}

/// Propagate amdgpu-flat-work-group-size attribute.
struct AAAMDFlatWorkGroupSize
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : Base(IRP, 32) {}

  /// See AbstractAttribute::getState(...).
  IntegerRangeState &getState() override { return *this; }
  const IntegerRangeState &getState() const override { return *this; }

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned MinGroupSize, MaxGroupSize;
    std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
    intersectKnown(
        ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));

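    // Kernel entry points keep the range implied by their attributes and the
    // subtarget; only non-entry functions are refined from their callers.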
    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);

      Change |=
          clampStateAndIndicateChange(this->getState(), CallerInfo.getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    unsigned Min, Max;
    std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);

    // Don't add the attribute if it's the implied default.
    if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
      return ChangeStatus::UNCHANGED;

    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;

    AttrList.push_back(
        Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
    return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
                                              /* ForceReplace */ true);
  }

  const std::string getAsStr() const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDFlatWorkGroupSize[";
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override {
    return "AAAMDFlatWorkGroupSize";
  }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;

AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}

class AMDGPUAttributor : public ModulePass {
public:
  AMDGPUAttributor() : ModulePass(ID) {}

  /// doInitialization - Virtual method overridden by subclasses to do
  /// any necessary initialization before any pass is run.
  bool doInitialization(Module &) override {
    auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
    if (!TPC)
      report_fatal_error("TargetMachine is required");

    TM = &TPC->getTM<TargetMachine>();
    return false;
  }

  bool runOnModule(Module &M) override {
    SetVector<Function *> Functions;
    AnalysisGetter AG;
    for (Function &F : M) {
      if (!F.isIntrinsic())
        Functions.insert(&F);
    }

    CallGraphUpdater CGUpdater;
    BumpPtrAllocator Allocator;
    AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
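    // Restrict the Attributor to the abstract attributes this pass actually
    // queries; this keeps the fixpoint iteration cheap.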
    DenseSet<const char *> Allowed(
        {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
         &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});

    AttributorConfig AC(CGUpdater);
    AC.Allowed = &Allowed;
    AC.IsModulePass = true;
    AC.DefaultInitializeLiveInternals = false;

    Attributor A(Functions, InfoCache, AC);

    for (Function &F : M) {
      if (!F.isIntrinsic()) {
        A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
        A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
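        // Kernels get their flat work group size from their own attributes;
        // only seed the propagation for non-entry functions.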
        if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
          A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
        }
      }
    }

    ChangeStatus Change = A.run();
    return Change == ChangeStatus::CHANGED;
  }

  StringRef getPassName() const override { return "AMDGPU Attributor"; }
  TargetMachine *TM;
  static char ID;
};
} // namespace

char AMDGPUAttributor::ID = 0;

Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
INITIALIZE_PASS(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false, false)