//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with nearby immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset into the immediate field
// by adjusting the base. It looks for a base register used by nearby
// instructions that leaves a 13-bit constant offset, which can then be
// promoted into the immediate.
// E.g.
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// =>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, recomputes live intervals, and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together, we can add to the base
//   pointer and use the new, reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {
enum InstClassEnum {
  UNKNOWN,
  DS_READ,
  DS_WRITE,
  S_BUFFER_LOAD_IMM,
  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
};

enum RegisterEnum {
  SBASE = 0x1,
  SRSRC = 0x2,
  SOFFSET = 0x4,
  VADDR = 0x8,
  ADDR = 0x10,
};

class SILoadStoreOptimizer : public MachineFunctionPass {
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned Width0;
    unsigned Width1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    SmallVector<MachineInstr *, 8> InstsToMove;
  };

  struct BaseRegisters {
    unsigned LoReg = 0;
    unsigned HiReg = 0;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };

  struct MemAddress {
    BaseRegisters Base;
    int64_t Offset = 0;
  };

  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
  bool OptimizeAgain;

  static bool offsetsCanBeCombined(CombineInfo &CI);
  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  static unsigned getNewOpcode(const CombineInfo &CI);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  unsigned getOpcodeWidth(const MachineInstr &MI);
  InstClassEnum getInstClass(unsigned Opc);
  unsigned getRegs(unsigned Opc);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
                           int32_t NewOffset);
  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
  /// Promotes a constant offset into the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13-bit constant offset, which is then promoted into the immediate.
  bool promoteConstantOffsetToImm(MachineInstr &CI,
                                  MemInfoMap &Visited,
                                  SmallPtrSet<MachineInstr *, 4> &Promoted);
196*b5893f02SDimitry Andric 
1978f0fd8f6SDimitry Andric public:
1988f0fd8f6SDimitry Andric   static char ID;
1998f0fd8f6SDimitry Andric 
SILoadStoreOptimizer()200d8866befSDimitry Andric   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
2018f0fd8f6SDimitry Andric     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
2028f0fd8f6SDimitry Andric   }
2038f0fd8f6SDimitry Andric 
2048f0fd8f6SDimitry Andric   bool optimizeBlock(MachineBasicBlock &MBB);
2058f0fd8f6SDimitry Andric 
2068f0fd8f6SDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
2078f0fd8f6SDimitry Andric 
getPassName() const2084ba319b5SDimitry Andric   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
2098f0fd8f6SDimitry Andric 
getAnalysisUsage(AnalysisUsage & AU) const2108f0fd8f6SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
2118f0fd8f6SDimitry Andric     AU.setPreservesCFG();
212d88c1a5aSDimitry Andric     AU.addRequired<AAResultsWrapperPass>();
2138f0fd8f6SDimitry Andric 
2148f0fd8f6SDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
2158f0fd8f6SDimitry Andric   }
2168f0fd8f6SDimitry Andric };
2178f0fd8f6SDimitry Andric 
2187a7e6055SDimitry Andric } // end anonymous namespace.
2198f0fd8f6SDimitry Andric 
2208f0fd8f6SDimitry Andric INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
2214ba319b5SDimitry Andric                       "SI Load Store Optimizer", false, false)
222d88c1a5aSDimitry Andric INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
223*b5893f02SDimitry Andric INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
224*b5893f02SDimitry Andric                     false, false)
2258f0fd8f6SDimitry Andric 
2268f0fd8f6SDimitry Andric char SILoadStoreOptimizer::ID = 0;
2278f0fd8f6SDimitry Andric 
2288f0fd8f6SDimitry Andric char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
2298f0fd8f6SDimitry Andric 
createSILoadStoreOptimizerPass()230d8866befSDimitry Andric FunctionPass *llvm::createSILoadStoreOptimizerPass() {
231d8866befSDimitry Andric   return new SILoadStoreOptimizer();
2328f0fd8f6SDimitry Andric }

static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr *> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
                                  DenseSet<unsigned> &PhysRegUses,
                                  SmallVectorImpl<MachineInstr *> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                                    ArrayRef<MachineInstr *> InstsToMove,
                                    const SIInstrInfo *TII, AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
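  // The two accesses must be exactly adjacent in elements: one access's
  // offset plus its width must equal the other's offset, e.g. a one-dword
  // load at element offset 4 pairs with one at element offset 5. The cache
  // policy (glc/slc) bits must also agree.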
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width0 == EltOffset1 ||
            EltOffset1 + CI.Width1 == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride 64 versions.
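  // E.g. element offsets 256 and 320 are both multiples of 64, so the pair
  // can be encoded as offset0:4 offset1:5 in the ST64 form.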
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
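  // E.g. element offsets 300 and 430 don't fit in 8 bits, but re-basing at
  // the smaller offset leaves 0 and 130, which do. The stride-64 form is
  // tried first for differences that are multiples of 64.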
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI) {
  const unsigned Width = (CI.Width0 + CI.Width1);
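  // A combined width of up to 4 dwords is allowed, and 3 only on targets
  // with dwordx3 load/stores; S_BUFFER_LOAD merges only to the x2 and x4
  // forms. DS opcodes report a width of 0 and pass trivially.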
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return false;
    case 2:
    case 4:
      return true;
    }
  }
}

unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
  const unsigned Opc = MI.getOpcode();

  if (TII->isMUBUF(MI)) {
    return AMDGPU::getMUBUFDwords(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return 4;
  }
}

InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);

    // If we couldn't identify the opcode, bail out.
    if (baseOpcode == -1) {
      return UNKNOWN;
    }

    switch (baseOpcode) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
      return BUFFER_LOAD_OFFEN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
      return BUFFER_LOAD_OFFSET;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
      return BUFFER_STORE_OFFEN;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
      return BUFFER_STORE_OFFSET;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
      return BUFFER_LOAD_OFFEN_exact;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
      return BUFFER_LOAD_OFFSET_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
      return BUFFER_STORE_OFFEN_exact;
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
      return BUFFER_STORE_OFFSET_exact;
    }
  }

  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  }
}

unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
  if (TII->isMUBUF(Opc)) {
    unsigned result = 0;

    if (AMDGPU::getMUBUFHasVAddr(Opc)) {
      result |= VADDR;
    }

    if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
      result |= SRSRC;
    }

    if (AMDGPU::getMUBUFHasSoffset(Opc)) {
      result |= SOFFSET;
    }

    return result;
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
    return SBASE;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return ADDR;
  }
}

bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  const unsigned Opc = CI.I->getOpcode();
  const InstClassEnum InstClass = getInstClass(Opc);

  if (InstClass == UNKNOWN) {
    return false;
  }

  const unsigned Regs = getRegs(Opc);

  unsigned AddrOpName[5] = {0};
  int AddrIdx[5];
  const MachineOperand *AddrReg[5];
  unsigned NumAddresses = 0;

  if (Regs & ADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  }

  if (Regs & SBASE) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  }

  if (Regs & SRSRC) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  }

  if (Regs & SOFFSET) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  }

  if (Regs & VADDR) {
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for (; MBBI != E; ++MBBI) {
    const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);

    if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
        (IsDS && (MBBI->getOpcode() != Opc))) {
      // This is not a matching instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.
562d88c1a5aSDimitry Andric 
5632cab237bSDimitry Andric       if (MBBI->hasUnmodeledSideEffects()) {
564d88c1a5aSDimitry Andric         // We can't re-order this instruction with respect to other memory
5652cab237bSDimitry Andric         // operations, so we fail both conditions mentioned above.
5667a7e6055SDimitry Andric         return false;
5672cab237bSDimitry Andric       }
5688f0fd8f6SDimitry Andric 
569d88c1a5aSDimitry Andric       if (MBBI->mayLoadOrStore() &&
5702cab237bSDimitry Andric           (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
5712cab237bSDimitry Andric            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
572d88c1a5aSDimitry Andric         // We fail condition #1, but we may still be able to satisfy condition
573d88c1a5aSDimitry Andric         // #2.  Add this instruction to the move list and then we will check
574d88c1a5aSDimitry Andric         // if condition #2 holds once we have selected the matching instruction.
5757a7e6055SDimitry Andric         CI.InstsToMove.push_back(&*MBBI);
5764ba319b5SDimitry Andric         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
577d88c1a5aSDimitry Andric         continue;
578d88c1a5aSDimitry Andric       }

      // When we match I with another instruction, we will be moving I down
      // to the location of the matched instruction, so any uses of I will
      // need to be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx =
          AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Width0 = getOpcodeWidth(*CI.I);
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Width1 = getOpcodeWidth(*MBBI);
      CI.Paired = MBBI;

      if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

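  // The merged register covers both original destinations: for 32-bit
  // elements each destination becomes one 32-bit subregister of a 64-bit
  // pair; for 64-bit elements each becomes a 64-bit half of a 128-bit
  // register.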
  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC =
      (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
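  // If offsetsCanBeCombined() factored out a common base offset, materialize
  // it into an SGPR and add it to the original address so that the reduced
  // 8-bit offsets are correct relative to the new base.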
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
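  // As in mergeRead2Pair(), materialize any common base offset factored out
  // by offsetsCanBeCombined() so the reduced 8-bit offsets stay correct.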
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, CI.Paired, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);

  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
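  // The merged load starts at the smaller of the two offsets;
  // getSubRegIdxs() determines which half of the result each original
  // destination receives.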

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
895*b5893f02SDimitry Andric 
896*b5893f02SDimitry Andric   // Copy to the new source register.
8972cab237bSDimitry Andric   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
8982cab237bSDimitry Andric   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
8992cab237bSDimitry Andric 
9002cab237bSDimitry Andric   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
9012cab237bSDimitry Andric 
902*b5893f02SDimitry Andric   const unsigned Regs = getRegs(Opcode);
903*b5893f02SDimitry Andric 
904*b5893f02SDimitry Andric   if (Regs & VADDR)
9052cab237bSDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
9062cab237bSDimitry Andric 
9072cab237bSDimitry Andric   MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
9082cab237bSDimitry Andric       .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
9092cab237bSDimitry Andric       .addImm(MergedOffset) // offset
9102cab237bSDimitry Andric       .addImm(CI.GLC0)      // glc
9112cab237bSDimitry Andric       .addImm(CI.SLC0)      // slc
9122cab237bSDimitry Andric       .addImm(0)            // tfe
913*b5893f02SDimitry Andric       .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
9142cab237bSDimitry Andric 
915*b5893f02SDimitry Andric   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
916*b5893f02SDimitry Andric   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
917*b5893f02SDimitry Andric   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
9182cab237bSDimitry Andric 
9192cab237bSDimitry Andric   // Copy to the old destination registers.
9202cab237bSDimitry Andric   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
9212cab237bSDimitry Andric   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
9222cab237bSDimitry Andric   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
9232cab237bSDimitry Andric 
9242cab237bSDimitry Andric   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
9252cab237bSDimitry Andric       .add(*Dest0) // Copy to same destination including flags and sub reg.
9262cab237bSDimitry Andric       .addReg(DestReg, 0, SubRegIdx0);
9272cab237bSDimitry Andric   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
9282cab237bSDimitry Andric                             .add(*Dest1)
9292cab237bSDimitry Andric                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
9302cab237bSDimitry Andric 
9312cab237bSDimitry Andric   moveInstsAfter(Copy1, CI.InstsToMove);
9322cab237bSDimitry Andric 
9332cab237bSDimitry Andric   MachineBasicBlock::iterator Next = std::next(CI.I);
9342cab237bSDimitry Andric   CI.I->eraseFromParent();
9352cab237bSDimitry Andric   CI.Paired->eraseFromParent();
9362cab237bSDimitry Andric   return Next;
9372cab237bSDimitry Andric }
9382cab237bSDimitry Andric 
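// Select the opcode of the merged instruction based on the instruction class
// and the combined width (in dwords).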
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  const unsigned Width = CI.Width0 + CI.Width1;

  switch (CI.InstClass) {
  default:
    return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    }
  }
}

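// Map the two original destinations onto subregister indices of the merged
// register. The instruction with the lower offset takes the low
// subregisters; e.g. Width0 = 1, Width1 = 2 with Offset0 < Offset1 yields
// (sub0, sub1_sub2).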
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  if (CI.Offset0 > CI.Offset1) {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
      case 3:
        return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
      case 2:
        return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
      }
    }
  } else {
    switch (CI.Width0) {
    default:
      return std::make_pair(0, 0);
    case 1:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
      case 2:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
      case 3:
        return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
      }
    case 2:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
      case 2:
        return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
      }
    case 3:
      switch (CI.Width1) {
      default:
        return std::make_pair(0, 0);
      case 1:
        return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
      }
    }
  }
}

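// Select the register class for the merged result: scalar (SGPR) classes for
// S_BUFFER loads, vector (VGPR) classes otherwise, keyed by the combined
// width in dwords.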
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 4:
      return &AMDGPU::SReg_128RegClass;
    case 8:
      return &AMDGPU::SReg_256RegClass;
    case 16:
      return &AMDGPU::SReg_512RegClass;
    }
  } else {
    switch (CI.Width0 + CI.Width1) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::VReg_64RegClass;
    case 3:
      return &AMDGPU::VReg_96RegClass;
    case 4:
      return &AMDGPU::VReg_128RegClass;
    }
  }
}

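// Merge the two MUBUF stores in CI into one wider store. The two source
// registers are first combined into a super-register with REG_SEQUENCE.
// For example (illustrative):
//  buffer_store_dword v0, ... offset:4
//  buffer_store_dword v1, ... offset:8
// ==>
//  buffer_store_dwordx2 v[0:1], ... offset:4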
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy the two sources into the new, wider source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  const unsigned Regs = getRegs(Opcode);

  if (Regs & VADDR)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

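// Return Val as an immediate operand if it is a legal inline constant;
// otherwise materialize it into a fresh SGPR with an S_MOV_B32 inserted
// before MI and return that register.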
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}

// Compute base address using Addr and return the final register.
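// The sum (Base + Addr.Offset) is materialized as a 64-bit add: a
// V_ADD_I32 / V_ADDC_U32 carry pair whose halves are recombined with
// REG_SEQUENCE.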
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
  unsigned CarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
  unsigned DeadCarryReg =
      MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
          .add(OffsetLo);
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  MachineInstr *HiHalf =
      BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
          .add(OffsetHi)
          .addReg(CarryReg, RegState::Kill);
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  MachineInstr *FullBase =
      BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
          .addReg(DestSub0)
          .addImm(AMDGPU::sub0)
          .addReg(DestSub1)
          .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}

// Update MI's base register and immediate offset to NewBase and NewOffset.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               unsigned NewBase,
                                               int32_t NewOffset) {
  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

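// If Op is an immediate, or a register defined by an S_MOV_B32 of an
// immediate, return that constant; otherwise return None.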
Optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return None;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return None;

  return Def->getOperand(1).getImm();
}

// Analyze Base and extract:
//  - 32-bit base registers and subregisters
//  - a 64-bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) {
  if (!Base.isReg())
    return;

  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}

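// Try to fold the constant address offset of the load MI into its immediate
// offset field by re-basing MI on a nearby "anchor" instruction that shares
// the same base registers. Returns true if MI (and possibly other loads with
// the same base) was rewritten.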
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) {

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  // TODO: Support stores.
  if (!MI.mayLoad())
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step 1: Find the base registers and a 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step 2: Traverse through MI's basic block and find an anchor (an
  // instruction with the same base registers) whose offset has the largest
  // legal 13-bit distance from MI's offset.
  // E.g. (64-bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new
  // base from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and
  // &a + 8192 have a 13-bit distance from &a + 4096. The heuristic picks
  // &a + 8192 as the new base (anchor) because the maximum distance can
  // presumably accommodate more intermediate bases.
  //
  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

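  // Scan forward from MI, collecting loads of the same opcode that share its
  // base registers; remember each candidate with its offset and keep the one
  // at the farthest legal distance as the anchor.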
  for (; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

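  // Re-base MI on the anchor, then promote every other collected load whose
  // distance from the anchor is also a legal immediate offset.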
  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}

// Scan through looking for adjacent memory operations (LDS, SMEM, and MUBUF)
// with constant offsets from the same base register. We rely on the scheduler
// to do the hard work of clustering nearby loads, and assume these are all
// adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  // Caches the base address and constant offset computed for each instruction
  // visited by the constant-offset promotion.
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the immediate.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    const unsigned Opc = MI.getOpcode();

    CombineInfo CI;
    CI.I = I;
    CI.InstClass = getInstClass(Opc);

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      CI.EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case DS_WRITE:
      CI.EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }
      continue;
    case S_BUFFER_LOAD_IMM:
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
      } else {
        ++I;
      }
      continue;
    case BUFFER_LOAD_OFFEN:
    case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
    case BUFFER_LOAD_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    case BUFFER_STORE_OFFEN:
    case BUFFER_STORE_OFFSET:
    case BUFFER_STORE_OFFEN_exact:
    case BUFFER_STORE_OFFSET_exact:
      CI.EltSize = 4;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

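  // Merging can itself create new merge opportunities (e.g. two x2 loads
  // produced above may now combine into an x4), so reprocess each block
  // until no further merges are found.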
  for (MachineBasicBlock &MBB : MF) {
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MBB);
    } while (OptimizeAgain);
  }

  return Modified;
}