1*c9157d92SDimitry Andric //===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
2*c9157d92SDimitry Andric //
3*c9157d92SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*c9157d92SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
5*c9157d92SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*c9157d92SDimitry Andric //
7*c9157d92SDimitry Andric //===----------------------------------------------------------------------===//
8*c9157d92SDimitry Andric //
9*c9157d92SDimitry Andric // This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
10*c9157d92SDimitry Andric // or dim=2darraymsaa into a single image_msaa_load intrinsic if:
11*c9157d92SDimitry Andric //
12*c9157d92SDimitry Andric // - they refer to the same vaddr except for sample_id,
13*c9157d92SDimitry Andric // - they use a constant sample_id and they fall into the same group,
14*c9157d92SDimitry Andric // - they have the same dmask and the number of intrinsics and the number of
15*c9157d92SDimitry Andric //   vaddr/vdata dword transfers is reduced by the combine.
16*c9157d92SDimitry Andric //
17*c9157d92SDimitry Andric // Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
18*c9157d92SDimitry Andric //
19*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
20*c9157d92SDimitry Andric // | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
21*c9157d92SDimitry Andric // |  (dmask) |     |     |       | vdata   |            | vdata   |          |
22*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
23*c9157d92SDimitry Andric // |        1 |   0 |   0 |     4 |  12 / 4 |          1 |   3 / 4 | yes      |
24*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
25*c9157d92SDimitry Andric // |        1 |   0 |   0 |     2 |   6 / 2 |          1 |   3 / 4 | yes?     |
26*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
27*c9157d92SDimitry Andric // |        2 |   0 |   0 |     4 |  12 / 8 |          2 |   6 / 8 | yes      |
28*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
29*c9157d92SDimitry Andric // |        2 |   0 |   0 |     2 |   6 / 4 |          2 |   6 / 8 | no       |
30*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
31*c9157d92SDimitry Andric // |        1 |   0 |   1 |     2 |   6 / 2 |          1 |   3 / 2 | yes      |
32*c9157d92SDimitry Andric // +----------+-----+-----+-------+---------+------------+---------+----------+
33*c9157d92SDimitry Andric //
34*c9157d92SDimitry Andric // Some cases are of questionable benefit, like the one marked with "yes?"
35*c9157d92SDimitry Andric // above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
36*c9157d92SDimitry Andric // and TX, but higher vdata. We start by erring on the side of converting these
37*c9157d92SDimitry Andric // to MSAA_LOAD.
38*c9157d92SDimitry Andric //
39*c9157d92SDimitry Andric // clang-format off
40*c9157d92SDimitry Andric //
// This pass will combine intrinsics such as (not necessarily consecutive):
42*c9157d92SDimitry Andric //  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
43*c9157d92SDimitry Andric //  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
44*c9157d92SDimitry Andric //  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
45*c9157d92SDimitry Andric //  call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
46*c9157d92SDimitry Andric // ==>
47*c9157d92SDimitry Andric //  call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
48*c9157d92SDimitry Andric //
49*c9157d92SDimitry Andric // clang-format on
50*c9157d92SDimitry Andric //
51*c9157d92SDimitry Andric // Future improvements:
52*c9157d92SDimitry Andric //
53*c9157d92SDimitry Andric // - We may occasionally not want to do the combine if it increases the maximum
54*c9157d92SDimitry Andric //   register pressure.
55*c9157d92SDimitry Andric //
56*c9157d92SDimitry Andric // - Ensure clausing when multiple MSAA_LOAD are generated.
57*c9157d92SDimitry Andric //
58*c9157d92SDimitry Andric // Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
59*c9157d92SDimitry Andric // combine only applies to gfx11, due to a limitation in gfx10: the gfx10
60*c9157d92SDimitry Andric // IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
61*c9157d92SDimitry Andric // we don't know the format at compile time.
62*c9157d92SDimitry Andric //===----------------------------------------------------------------------===//
63*c9157d92SDimitry Andric 
64*c9157d92SDimitry Andric #include "AMDGPU.h"
65*c9157d92SDimitry Andric #include "AMDGPUInstrInfo.h"
66*c9157d92SDimitry Andric #include "AMDGPUTargetMachine.h"
67*c9157d92SDimitry Andric #include "llvm/IR/Function.h"
68*c9157d92SDimitry Andric #include "llvm/IR/IRBuilder.h"
69*c9157d92SDimitry Andric #include "llvm/IR/IntrinsicInst.h"
70*c9157d92SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
71*c9157d92SDimitry Andric #include "llvm/Pass.h"
72*c9157d92SDimitry Andric #include "llvm/Support/raw_ostream.h"
73*c9157d92SDimitry Andric 
74*c9157d92SDimitry Andric using namespace llvm;
75*c9157d92SDimitry Andric 
76*c9157d92SDimitry Andric #define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
77*c9157d92SDimitry Andric 
namespace {
// Legacy pass-manager wrapper around the image-intrinsic combining logic in
// imageIntrinsicOptimizerImpl().
class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
  // Target machine used to query the GCN subtarget; may be null, in which
  // case the pass does nothing.
  const TargetMachine *TM;

public:
  static char ID;

  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}

  bool runOnFunction(Function &F) override;

}; // End of class AMDGPUImageIntrinsicOptimizer
} // End anonymous namespace
92*c9157d92SDimitry Andric 
// Register the legacy pass so it can be constructed by name
// (DEBUG_TYPE == "amdgpu-image-intrinsic-opt").
INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
                "AMDGPU Image Intrinsic Optimizer", false, false)

char AMDGPUImageIntrinsicOptimizer::ID = 0;
97*c9157d92SDimitry Andric 
addInstToMergeableList(IntrinsicInst * II,SmallVector<SmallVector<IntrinsicInst *,4>> & MergeableInsts,const AMDGPU::ImageDimIntrinsicInfo * ImageDimIntr)98*c9157d92SDimitry Andric void addInstToMergeableList(
99*c9157d92SDimitry Andric     IntrinsicInst *II,
100*c9157d92SDimitry Andric     SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
101*c9157d92SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
102*c9157d92SDimitry Andric   for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
103*c9157d92SDimitry Andric     // Check Dim.
104*c9157d92SDimitry Andric     if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
105*c9157d92SDimitry Andric       continue;
106*c9157d92SDimitry Andric 
107*c9157d92SDimitry Andric     // Check D16.
108*c9157d92SDimitry Andric     if (IIList.front()->getType() != II->getType())
109*c9157d92SDimitry Andric       continue;
110*c9157d92SDimitry Andric 
111*c9157d92SDimitry Andric     // Check all arguments (DMask, VAddr, RSrc etc).
112*c9157d92SDimitry Andric     bool AllEqual = true;
113*c9157d92SDimitry Andric     assert(IIList.front()->arg_size() == II->arg_size());
114*c9157d92SDimitry Andric     for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
115*c9157d92SDimitry Andric       Value *ArgList = IIList.front()->getArgOperand(I);
116*c9157d92SDimitry Andric       Value *Arg = II->getArgOperand(I);
117*c9157d92SDimitry Andric       if (I == ImageDimIntr->VAddrEnd - 1) {
118*c9157d92SDimitry Andric         // Check FragId group.
119*c9157d92SDimitry Andric         auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
120*c9157d92SDimitry Andric         auto FragId = cast<ConstantInt>(II->getArgOperand(I));
121*c9157d92SDimitry Andric         AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
122*c9157d92SDimitry Andric       } else {
123*c9157d92SDimitry Andric         // Check all arguments except FragId.
124*c9157d92SDimitry Andric         AllEqual = ArgList == Arg;
125*c9157d92SDimitry Andric       }
126*c9157d92SDimitry Andric     }
127*c9157d92SDimitry Andric     if (!AllEqual)
128*c9157d92SDimitry Andric       continue;
129*c9157d92SDimitry Andric 
130*c9157d92SDimitry Andric     // Add to the list.
131*c9157d92SDimitry Andric     IIList.emplace_back(II);
132*c9157d92SDimitry Andric     return;
133*c9157d92SDimitry Andric   }
134*c9157d92SDimitry Andric 
135*c9157d92SDimitry Andric   // Similar instruction not found, so add a new list.
136*c9157d92SDimitry Andric   MergeableInsts.emplace_back(1, II);
137*c9157d92SDimitry Andric   LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
138*c9157d92SDimitry Andric }
139*c9157d92SDimitry Andric 
140*c9157d92SDimitry Andric // Collect list of all instructions we know how to merge in a subset of the
141*c9157d92SDimitry Andric // block. It returns an iterator to the instruction after the last one analyzed.
collectMergeableInsts(BasicBlock::iterator I,BasicBlock::iterator E,SmallVector<SmallVector<IntrinsicInst *,4>> & MergeableInsts)142*c9157d92SDimitry Andric BasicBlock::iterator collectMergeableInsts(
143*c9157d92SDimitry Andric     BasicBlock::iterator I, BasicBlock::iterator E,
144*c9157d92SDimitry Andric     SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
145*c9157d92SDimitry Andric   for (; I != E; ++I) {
146*c9157d92SDimitry Andric     // Don't combine if there is a store in the middle or if there is a memory
147*c9157d92SDimitry Andric     // barrier.
148*c9157d92SDimitry Andric     if (I->mayHaveSideEffects()) {
149*c9157d92SDimitry Andric       ++I;
150*c9157d92SDimitry Andric       break;
151*c9157d92SDimitry Andric     }
152*c9157d92SDimitry Andric 
153*c9157d92SDimitry Andric     // Ignore non-intrinsics.
154*c9157d92SDimitry Andric     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
155*c9157d92SDimitry Andric       Intrinsic::ID IntrinID = II->getIntrinsicID();
156*c9157d92SDimitry Andric 
157*c9157d92SDimitry Andric       // Ignore other intrinsics.
158*c9157d92SDimitry Andric       if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
159*c9157d92SDimitry Andric           IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
160*c9157d92SDimitry Andric         continue;
161*c9157d92SDimitry Andric 
162*c9157d92SDimitry Andric       // Check for constant FragId.
163*c9157d92SDimitry Andric       const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
164*c9157d92SDimitry Andric       const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
165*c9157d92SDimitry Andric       if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
166*c9157d92SDimitry Andric         continue;
167*c9157d92SDimitry Andric 
168*c9157d92SDimitry Andric       LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
169*c9157d92SDimitry Andric       addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
170*c9157d92SDimitry Andric     }
171*c9157d92SDimitry Andric   }
172*c9157d92SDimitry Andric 
173*c9157d92SDimitry Andric   return I;
174*c9157d92SDimitry Andric }
175*c9157d92SDimitry Andric 
// Try to combine each group of mergeable image_load intrinsics into
// image_msaa_load intrinsics, provided the instruction/transfer-count
// tradeoff (see the file header) favors the combine. Returns true if any
// code was changed.
bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
  bool Modified = false;

  SmallVector<Instruction *, 4> InstrsToErase;
  for (const auto &IIList : MergeableInsts) {
    // A single load cannot be combined with anything.
    if (IIList.size() <= 1)
      continue;

    // Assume the arguments are unchanged and later override them, if needed.
    SmallVector<Value *, 16> Args(IIList.front()->args());

    // Validate function argument and return types, extracting overloaded
    // types along the way.
    SmallVector<Type *, 6> OverloadTys;
    Function *F = IIList.front()->getCalledFunction();
    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
      continue;

    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
        AMDGPU::getImageDimIntrinsicInfo(IntrinID);

    // The combined msaa_load returns all four samples of one channel as a
    // <4 x elt> vector, so override the overloaded return type (slot 0).
    Type *EltTy = IIList.front()->getType()->getScalarType();
    Type *NewTy = FixedVectorType::get(EltTy, 4);
    OverloadTys[0] = NewTy;
    bool isD16 = EltTy->isHalfTy();

    ConstantInt *DMask = cast<ConstantInt>(
        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
    unsigned NumElts = popcount(DMaskVal);

    // Number of instructions and the number of vaddr/vdata dword transfers
    // should be reduced.
    unsigned NumLoads = IIList.size();
    unsigned NumMsaas = NumElts; // one msaa_load per enabled channel
    unsigned NumVAddrLoads = 3 * NumLoads;
    // d16 packs two half elements per dword transfer.
    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
    unsigned NumVAddrMsaas = 3 * NumMsaas;
    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;

    if (NumLoads < NumMsaas ||
        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
      continue;

    // All loads in the group share a FragId group of four; the combined load
    // is addressed with the group's base sample id (FragId rounded down to a
    // multiple of 4).
    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;

    // Create the new instructions.
    IRBuilder<> B(IIList.front());

    // Create one image_msaa_load per bit set in the dmask; each loads all
    // four samples of a single channel.
    SmallVector<Instruction *, 4> NewCalls;
    while (DMaskVal != 0) {
      // Peel off the lowest set dmask bit.
      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);

      Intrinsic::ID NewIntrinID;
      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
      else
        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;

      Function *NewIntrin = Intrinsic::getDeclaration(
          IIList.front()->getModule(), NewIntrinID, OverloadTys);
      Args[ImageDimIntr->DMaskIndex] =
          ConstantInt::get(DMask->getType(), NewMaskVal);
      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");

      NewCalls.push_back(NewCall);
      DMaskVal -= NewMaskVal;
    }

    // Rebuild each original load's result: the value of channel C for sample
    // FragId is element (FragId % 4) of the C-th new call's result.
    for (auto &II : IIList) {
      Value *VecOp = nullptr;
      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
      B.SetCurrentDebugLocation(II->getDebugLoc());
      if (NumElts == 1) {
        // Scalar result: a single extractelement suffices.
        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
      } else {
        // Vector result: gather one element per channel into a new vector.
        VecOp = UndefValue::get(II->getType());
        for (unsigned I = 0; I < NumElts; ++I) {
          VecOp = B.CreateInsertElement(
              VecOp,
              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
        }
      }

      // Replace the old instruction.
      II->replaceAllUsesWith(VecOp);
      VecOp->takeName(II);
      InstrsToErase.push_back(II);
    }

    Modified = true;
  }

  // Erasure is deferred so the loops above never touch freed instructions.
  for (auto I : InstrsToErase)
    I->eraseFromParent();

  return Modified;
}
283*c9157d92SDimitry Andric 
imageIntrinsicOptimizerImpl(Function & F,const TargetMachine * TM)284*c9157d92SDimitry Andric static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
285*c9157d92SDimitry Andric   if (!TM)
286*c9157d92SDimitry Andric     return false;
287*c9157d92SDimitry Andric 
288*c9157d92SDimitry Andric   // This optimization only applies to GFX11 and beyond.
289*c9157d92SDimitry Andric   const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
290*c9157d92SDimitry Andric   if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
291*c9157d92SDimitry Andric     return false;
292*c9157d92SDimitry Andric 
293*c9157d92SDimitry Andric   Module *M = F.getParent();
294*c9157d92SDimitry Andric 
295*c9157d92SDimitry Andric   // Early test to determine if the intrinsics are used.
296*c9157d92SDimitry Andric   if (std::none_of(M->begin(), M->end(), [](Function &F) {
297*c9157d92SDimitry Andric         return !F.users().empty() &&
298*c9157d92SDimitry Andric                (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
299*c9157d92SDimitry Andric                 F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
300*c9157d92SDimitry Andric       }))
301*c9157d92SDimitry Andric     return false;
302*c9157d92SDimitry Andric 
303*c9157d92SDimitry Andric   bool Modified = false;
304*c9157d92SDimitry Andric   for (auto &BB : F) {
305*c9157d92SDimitry Andric     BasicBlock::iterator SectionEnd;
306*c9157d92SDimitry Andric     for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
307*c9157d92SDimitry Andric          I = SectionEnd) {
308*c9157d92SDimitry Andric       SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;
309*c9157d92SDimitry Andric 
310*c9157d92SDimitry Andric       SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
311*c9157d92SDimitry Andric       Modified |= optimizeSection(MergeableInsts);
312*c9157d92SDimitry Andric     }
313*c9157d92SDimitry Andric   }
314*c9157d92SDimitry Andric 
315*c9157d92SDimitry Andric   return Modified;
316*c9157d92SDimitry Andric }
317*c9157d92SDimitry Andric 
runOnFunction(Function & F)318*c9157d92SDimitry Andric bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
319*c9157d92SDimitry Andric   if (skipFunction(F))
320*c9157d92SDimitry Andric     return false;
321*c9157d92SDimitry Andric 
322*c9157d92SDimitry Andric   return imageIntrinsicOptimizerImpl(F, TM);
323*c9157d92SDimitry Andric }
324*c9157d92SDimitry Andric 
325*c9157d92SDimitry Andric FunctionPass *
createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine * TM)326*c9157d92SDimitry Andric llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
327*c9157d92SDimitry Andric   return new AMDGPUImageIntrinsicOptimizer(TM);
328*c9157d92SDimitry Andric }
329*c9157d92SDimitry Andric 
330*c9157d92SDimitry Andric PreservedAnalyses
run(Function & F,FunctionAnalysisManager & AM)331*c9157d92SDimitry Andric AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
332*c9157d92SDimitry Andric                                        FunctionAnalysisManager &AM) {
333*c9157d92SDimitry Andric 
334*c9157d92SDimitry Andric   bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
335*c9157d92SDimitry Andric   return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
336*c9157d92SDimitry Andric }
337