10b57cec5SDimitry Andric //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This pass tries to fuse DS instructions with close by immediate offsets.
100b57cec5SDimitry Andric // This will fuse operations such as
110b57cec5SDimitry Andric //  ds_read_b32 v0, v2 offset:16
120b57cec5SDimitry Andric //  ds_read_b32 v1, v2 offset:32
130b57cec5SDimitry Andric // ==>
140b57cec5SDimitry Andric //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
150b57cec5SDimitry Andric //
160b57cec5SDimitry Andric // The same is done for certain SMEM and VMEM opcodes, e.g.:
170b57cec5SDimitry Andric //  s_buffer_load_dword s4, s[0:3], 4
180b57cec5SDimitry Andric //  s_buffer_load_dword s5, s[0:3], 8
190b57cec5SDimitry Andric // ==>
200b57cec5SDimitry Andric //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
210b57cec5SDimitry Andric //
220b57cec5SDimitry Andric // This pass also tries to promote constant offset to the immediate by
230b57cec5SDimitry Andric // adjusting the base. It tries to use a base from the nearby instructions that
240b57cec5SDimitry Andric // allows it to have a 13bit constant offset and then promotes the 13bit offset
250b57cec5SDimitry Andric // to the immediate.
260b57cec5SDimitry Andric // E.g.
270b57cec5SDimitry Andric //  s_movk_i32 s0, 0x1800
280b57cec5SDimitry Andric //  v_add_co_u32_e32 v0, vcc, s0, v2
290b57cec5SDimitry Andric //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
300b57cec5SDimitry Andric //
310b57cec5SDimitry Andric //  s_movk_i32 s0, 0x1000
320b57cec5SDimitry Andric //  v_add_co_u32_e32 v5, vcc, s0, v2
330b57cec5SDimitry Andric //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
340b57cec5SDimitry Andric //  global_load_dwordx2 v[5:6], v[5:6], off
350b57cec5SDimitry Andric //  global_load_dwordx2 v[0:1], v[0:1], off
360b57cec5SDimitry Andric // =>
370b57cec5SDimitry Andric //  s_movk_i32 s0, 0x1000
380b57cec5SDimitry Andric //  v_add_co_u32_e32 v5, vcc, s0, v2
390b57cec5SDimitry Andric //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
400b57cec5SDimitry Andric //  global_load_dwordx2 v[5:6], v[5:6], off
410b57cec5SDimitry Andric //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
420b57cec5SDimitry Andric //
430b57cec5SDimitry Andric // Future improvements:
440b57cec5SDimitry Andric //
458bcb0991SDimitry Andric // - This is currently missing stores of constants because loading
460b57cec5SDimitry Andric //   the constant into the data register is placed between the stores, although
470b57cec5SDimitry Andric //   this is arguably a scheduling problem.
480b57cec5SDimitry Andric //
490b57cec5SDimitry Andric // - Live interval recomputing seems inefficient. This currently only matches
500b57cec5SDimitry Andric //   one pair, and recomputes live intervals and moves on to the next pair. It
510b57cec5SDimitry Andric //   would be better to compute a list of all merges that need to occur.
520b57cec5SDimitry Andric //
530b57cec5SDimitry Andric // - With a list of instructions to process, we can also merge more. If a
540b57cec5SDimitry Andric //   cluster of loads have offsets that are too large to fit in the 8-bit
550b57cec5SDimitry Andric //   offsets, but are close enough to fit in the 8 bits, we can add to the base
560b57cec5SDimitry Andric //   pointer and use the new reduced offsets.
570b57cec5SDimitry Andric //
580b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
590b57cec5SDimitry Andric 
600b57cec5SDimitry Andric #include "AMDGPU.h"
61e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
620b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
630b57cec5SDimitry Andric #include "llvm/Analysis/AliasAnalysis.h"
640b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h"
65480093f4SDimitry Andric #include "llvm/InitializePasses.h"
660b57cec5SDimitry Andric 
670b57cec5SDimitry Andric using namespace llvm;
680b57cec5SDimitry Andric 
690b57cec5SDimitry Andric #define DEBUG_TYPE "si-load-store-opt"
700b57cec5SDimitry Andric 
710b57cec5SDimitry Andric namespace {
720b57cec5SDimitry Andric enum InstClassEnum {
730b57cec5SDimitry Andric   UNKNOWN,
740b57cec5SDimitry Andric   DS_READ,
750b57cec5SDimitry Andric   DS_WRITE,
760b57cec5SDimitry Andric   S_BUFFER_LOAD_IMM,
77bdd1243dSDimitry Andric   S_BUFFER_LOAD_SGPR_IMM,
78bdd1243dSDimitry Andric   S_LOAD_IMM,
798bcb0991SDimitry Andric   BUFFER_LOAD,
808bcb0991SDimitry Andric   BUFFER_STORE,
818bcb0991SDimitry Andric   MIMG,
82480093f4SDimitry Andric   TBUFFER_LOAD,
83480093f4SDimitry Andric   TBUFFER_STORE,
8481ad6265SDimitry Andric   GLOBAL_LOAD_SADDR,
8581ad6265SDimitry Andric   GLOBAL_STORE_SADDR,
8681ad6265SDimitry Andric   FLAT_LOAD,
8781ad6265SDimitry Andric   FLAT_STORE,
8881ad6265SDimitry Andric   GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
8981ad6265SDimitry Andric   GLOBAL_STORE // any CombineInfo, they are only ever returned by
9081ad6265SDimitry Andric                // getCommonInstClass.
910b57cec5SDimitry Andric };
920b57cec5SDimitry Andric 
935ffd83dbSDimitry Andric struct AddressRegs {
945ffd83dbSDimitry Andric   unsigned char NumVAddrs = 0;
955ffd83dbSDimitry Andric   bool SBase = false;
965ffd83dbSDimitry Andric   bool SRsrc = false;
975ffd83dbSDimitry Andric   bool SOffset = false;
9881ad6265SDimitry Andric   bool SAddr = false;
995ffd83dbSDimitry Andric   bool VAddr = false;
1005ffd83dbSDimitry Andric   bool Addr = false;
1015ffd83dbSDimitry Andric   bool SSamp = false;
1020b57cec5SDimitry Andric };
1030b57cec5SDimitry Andric 
1045ffd83dbSDimitry Andric // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
1055ffd83dbSDimitry Andric const unsigned MaxAddressRegs = 12 + 1 + 1;
1065ffd83dbSDimitry Andric 
1070b57cec5SDimitry Andric class SILoadStoreOptimizer : public MachineFunctionPass {
1080b57cec5SDimitry Andric   struct CombineInfo {
1090b57cec5SDimitry Andric     MachineBasicBlock::iterator I;
1100b57cec5SDimitry Andric     unsigned EltSize;
111480093f4SDimitry Andric     unsigned Offset;
112480093f4SDimitry Andric     unsigned Width;
113480093f4SDimitry Andric     unsigned Format;
1140b57cec5SDimitry Andric     unsigned BaseOff;
115480093f4SDimitry Andric     unsigned DMask;
1160b57cec5SDimitry Andric     InstClassEnum InstClass;
117fe6060f1SDimitry Andric     unsigned CPol = 0;
11804eeddc0SDimitry Andric     bool IsAGPR;
1190b57cec5SDimitry Andric     bool UseST64;
1205ffd83dbSDimitry Andric     int AddrIdx[MaxAddressRegs];
1215ffd83dbSDimitry Andric     const MachineOperand *AddrReg[MaxAddressRegs];
1228bcb0991SDimitry Andric     unsigned NumAddresses;
1235ffd83dbSDimitry Andric     unsigned Order;
1248bcb0991SDimitry Andric 
hasSameBaseAddress__anon56ab8fb50111::SILoadStoreOptimizer::CombineInfo125bdd1243dSDimitry Andric     bool hasSameBaseAddress(const CombineInfo &CI) {
126bdd1243dSDimitry Andric       if (NumAddresses != CI.NumAddresses)
127bdd1243dSDimitry Andric         return false;
128bdd1243dSDimitry Andric 
129bdd1243dSDimitry Andric       const MachineInstr &MI = *CI.I;
1308bcb0991SDimitry Andric       for (unsigned i = 0; i < NumAddresses; i++) {
1318bcb0991SDimitry Andric         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
1328bcb0991SDimitry Andric 
1338bcb0991SDimitry Andric         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
1348bcb0991SDimitry Andric           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
1358bcb0991SDimitry Andric               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
1368bcb0991SDimitry Andric             return false;
1378bcb0991SDimitry Andric           }
1388bcb0991SDimitry Andric           continue;
1398bcb0991SDimitry Andric         }
1408bcb0991SDimitry Andric 
1418bcb0991SDimitry Andric         // Check same base pointer. Be careful of subregisters, which can occur
1428bcb0991SDimitry Andric         // with vectors of pointers.
1438bcb0991SDimitry Andric         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
1448bcb0991SDimitry Andric             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
1458bcb0991SDimitry Andric          return false;
1468bcb0991SDimitry Andric         }
1478bcb0991SDimitry Andric       }
1488bcb0991SDimitry Andric       return true;
1498bcb0991SDimitry Andric     }
1508bcb0991SDimitry Andric 
hasMergeableAddress__anon56ab8fb50111::SILoadStoreOptimizer::CombineInfo1518bcb0991SDimitry Andric     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
1528bcb0991SDimitry Andric       for (unsigned i = 0; i < NumAddresses; ++i) {
1538bcb0991SDimitry Andric         const MachineOperand *AddrOp = AddrReg[i];
1548bcb0991SDimitry Andric         // Immediates are always OK.
1558bcb0991SDimitry Andric         if (AddrOp->isImm())
1568bcb0991SDimitry Andric           continue;
1578bcb0991SDimitry Andric 
1588bcb0991SDimitry Andric         // Don't try to merge addresses that aren't either immediates or registers.
1598bcb0991SDimitry Andric         // TODO: Should be possible to merge FrameIndexes and maybe some other
1608bcb0991SDimitry Andric         // non-register
1618bcb0991SDimitry Andric         if (!AddrOp->isReg())
1628bcb0991SDimitry Andric           return false;
1638bcb0991SDimitry Andric 
164*c9157d92SDimitry Andric         // TODO: We should be able to merge instructions with other physical reg
165*c9157d92SDimitry Andric         // addresses too.
166*c9157d92SDimitry Andric         if (AddrOp->getReg().isPhysical() &&
167*c9157d92SDimitry Andric             AddrOp->getReg() != AMDGPU::SGPR_NULL)
1688bcb0991SDimitry Andric           return false;
1698bcb0991SDimitry Andric 
170bdd1243dSDimitry Andric         // If an address has only one use then there will be no other
1718bcb0991SDimitry Andric         // instructions with the same address, so we can't merge this one.
1728bcb0991SDimitry Andric         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
1738bcb0991SDimitry Andric           return false;
1748bcb0991SDimitry Andric       }
1758bcb0991SDimitry Andric       return true;
1768bcb0991SDimitry Andric     }
1778bcb0991SDimitry Andric 
17804eeddc0SDimitry Andric     void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
17981ad6265SDimitry Andric 
18081ad6265SDimitry Andric     // Compare by pointer order.
operator <__anon56ab8fb50111::SILoadStoreOptimizer::CombineInfo18181ad6265SDimitry Andric     bool operator<(const CombineInfo& Other) const {
18281ad6265SDimitry Andric       return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
18381ad6265SDimitry Andric     }
1840b57cec5SDimitry Andric   };
1850b57cec5SDimitry Andric 
1860b57cec5SDimitry Andric   struct BaseRegisters {
1875ffd83dbSDimitry Andric     Register LoReg;
1885ffd83dbSDimitry Andric     Register HiReg;
1890b57cec5SDimitry Andric 
1900b57cec5SDimitry Andric     unsigned LoSubReg = 0;
1910b57cec5SDimitry Andric     unsigned HiSubReg = 0;
1920b57cec5SDimitry Andric   };
1930b57cec5SDimitry Andric 
1940b57cec5SDimitry Andric   struct MemAddress {
1950b57cec5SDimitry Andric     BaseRegisters Base;
1960b57cec5SDimitry Andric     int64_t Offset = 0;
1970b57cec5SDimitry Andric   };
1980b57cec5SDimitry Andric 
1990b57cec5SDimitry Andric   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
2000b57cec5SDimitry Andric 
2010b57cec5SDimitry Andric private:
2020b57cec5SDimitry Andric   const GCNSubtarget *STM = nullptr;
2030b57cec5SDimitry Andric   const SIInstrInfo *TII = nullptr;
2040b57cec5SDimitry Andric   const SIRegisterInfo *TRI = nullptr;
2050b57cec5SDimitry Andric   MachineRegisterInfo *MRI = nullptr;
2060b57cec5SDimitry Andric   AliasAnalysis *AA = nullptr;
2070b57cec5SDimitry Andric   bool OptimizeAgain;
2080b57cec5SDimitry Andric 
20981ad6265SDimitry Andric   bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
21081ad6265SDimitry Andric                            const DenseSet<Register> &ARegUses,
21181ad6265SDimitry Andric                            const MachineInstr &A, const MachineInstr &B) const;
212480093f4SDimitry Andric   static bool dmasksCanBeCombined(const CombineInfo &CI,
213480093f4SDimitry Andric                                   const SIInstrInfo &TII,
214480093f4SDimitry Andric                                   const CombineInfo &Paired);
2155ffd83dbSDimitry Andric   static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
2165ffd83dbSDimitry Andric                                    CombineInfo &Paired, bool Modify = false);
2175ffd83dbSDimitry Andric   static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218480093f4SDimitry Andric                         const CombineInfo &Paired);
219480093f4SDimitry Andric   static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220480093f4SDimitry Andric   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221480093f4SDimitry Andric                                                      const CombineInfo &Paired);
222480093f4SDimitry Andric   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223480093f4SDimitry Andric                                                     const CombineInfo &Paired);
224fe6060f1SDimitry Andric   const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
2250b57cec5SDimitry Andric 
22681ad6265SDimitry Andric   CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
2270b57cec5SDimitry Andric 
2280b57cec5SDimitry Andric   unsigned read2Opcode(unsigned EltSize) const;
2290b57cec5SDimitry Andric   unsigned read2ST64Opcode(unsigned EltSize) const;
23081ad6265SDimitry Andric   MachineBasicBlock::iterator
23181ad6265SDimitry Andric   mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
23281ad6265SDimitry Andric                  MachineBasicBlock::iterator InsertBefore);
2330b57cec5SDimitry Andric 
2340b57cec5SDimitry Andric   unsigned write2Opcode(unsigned EltSize) const;
2350b57cec5SDimitry Andric   unsigned write2ST64Opcode(unsigned EltSize) const;
2365ffd83dbSDimitry Andric   MachineBasicBlock::iterator
2375ffd83dbSDimitry Andric   mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
23881ad6265SDimitry Andric                   MachineBasicBlock::iterator InsertBefore);
2395ffd83dbSDimitry Andric   MachineBasicBlock::iterator
2405ffd83dbSDimitry Andric   mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
24181ad6265SDimitry Andric                  MachineBasicBlock::iterator InsertBefore);
2425ffd83dbSDimitry Andric   MachineBasicBlock::iterator
243bdd1243dSDimitry Andric   mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
24481ad6265SDimitry Andric                        MachineBasicBlock::iterator InsertBefore);
2455ffd83dbSDimitry Andric   MachineBasicBlock::iterator
2465ffd83dbSDimitry Andric   mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
24781ad6265SDimitry Andric                       MachineBasicBlock::iterator InsertBefore);
2485ffd83dbSDimitry Andric   MachineBasicBlock::iterator
2495ffd83dbSDimitry Andric   mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
25081ad6265SDimitry Andric                        MachineBasicBlock::iterator InsertBefore);
2515ffd83dbSDimitry Andric   MachineBasicBlock::iterator
2525ffd83dbSDimitry Andric   mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
25381ad6265SDimitry Andric                        MachineBasicBlock::iterator InsertBefore);
2545ffd83dbSDimitry Andric   MachineBasicBlock::iterator
2555ffd83dbSDimitry Andric   mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
25681ad6265SDimitry Andric                         MachineBasicBlock::iterator InsertBefore);
25781ad6265SDimitry Andric   MachineBasicBlock::iterator
25881ad6265SDimitry Andric   mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
25981ad6265SDimitry Andric                     MachineBasicBlock::iterator InsertBefore);
26081ad6265SDimitry Andric   MachineBasicBlock::iterator
26181ad6265SDimitry Andric   mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
26281ad6265SDimitry Andric                      MachineBasicBlock::iterator InsertBefore);
2630b57cec5SDimitry Andric 
2645ffd83dbSDimitry Andric   void updateBaseAndOffset(MachineInstr &I, Register NewBase,
2658bcb0991SDimitry Andric                            int32_t NewOffset) const;
2665ffd83dbSDimitry Andric   Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
2678bcb0991SDimitry Andric   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268bdd1243dSDimitry Andric   std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
2698bcb0991SDimitry Andric   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
2700b57cec5SDimitry Andric   /// Promotes constant offset to the immediate by adjusting the base. It
2710b57cec5SDimitry Andric   /// tries to use a base from the nearby instructions that allows it to have
2720b57cec5SDimitry Andric   /// a 13bit constant offset which gets promoted to the immediate.
2730b57cec5SDimitry Andric   bool promoteConstantOffsetToImm(MachineInstr &CI,
2740b57cec5SDimitry Andric                                   MemInfoMap &Visited,
2758bcb0991SDimitry Andric                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
2768bcb0991SDimitry Andric   void addInstToMergeableList(const CombineInfo &CI,
2778bcb0991SDimitry Andric                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
2785ffd83dbSDimitry Andric 
2795ffd83dbSDimitry Andric   std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
2805ffd83dbSDimitry Andric       MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2815ffd83dbSDimitry Andric       MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2828bcb0991SDimitry Andric       std::list<std::list<CombineInfo>> &MergeableInsts) const;
2830b57cec5SDimitry Andric 
28481ad6265SDimitry Andric   static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
28581ad6265SDimitry Andric                                                      const CombineInfo &Paired);
28681ad6265SDimitry Andric 
28781ad6265SDimitry Andric   static InstClassEnum getCommonInstClass(const CombineInfo &CI,
28881ad6265SDimitry Andric                                           const CombineInfo &Paired);
28981ad6265SDimitry Andric 
2900b57cec5SDimitry Andric public:
2910b57cec5SDimitry Andric   static char ID;
2920b57cec5SDimitry Andric 
SILoadStoreOptimizer()2930b57cec5SDimitry Andric   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
2940b57cec5SDimitry Andric     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
2950b57cec5SDimitry Andric   }
2960b57cec5SDimitry Andric 
2978bcb0991SDimitry Andric   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
2988bcb0991SDimitry Andric                                      bool &OptimizeListAgain);
2998bcb0991SDimitry Andric   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
3000b57cec5SDimitry Andric 
3010b57cec5SDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
3020b57cec5SDimitry Andric 
getPassName() const3030b57cec5SDimitry Andric   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
3040b57cec5SDimitry Andric 
getAnalysisUsage(AnalysisUsage & AU) const3050b57cec5SDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override {
3060b57cec5SDimitry Andric     AU.setPreservesCFG();
3070b57cec5SDimitry Andric     AU.addRequired<AAResultsWrapperPass>();
3080b57cec5SDimitry Andric 
3090b57cec5SDimitry Andric     MachineFunctionPass::getAnalysisUsage(AU);
3100b57cec5SDimitry Andric   }
3115ffd83dbSDimitry Andric 
getRequiredProperties() const3125ffd83dbSDimitry Andric   MachineFunctionProperties getRequiredProperties() const override {
3135ffd83dbSDimitry Andric     return MachineFunctionProperties()
3145ffd83dbSDimitry Andric       .set(MachineFunctionProperties::Property::IsSSA);
3155ffd83dbSDimitry Andric   }
3160b57cec5SDimitry Andric };
3170b57cec5SDimitry Andric 
getOpcodeWidth(const MachineInstr & MI,const SIInstrInfo & TII)3188bcb0991SDimitry Andric static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
3198bcb0991SDimitry Andric   const unsigned Opc = MI.getOpcode();
3208bcb0991SDimitry Andric 
3218bcb0991SDimitry Andric   if (TII.isMUBUF(Opc)) {
3228bcb0991SDimitry Andric     // FIXME: Handle d16 correctly
3238bcb0991SDimitry Andric     return AMDGPU::getMUBUFElements(Opc);
3248bcb0991SDimitry Andric   }
325*c9157d92SDimitry Andric   if (TII.isImage(MI)) {
3268bcb0991SDimitry Andric     uint64_t DMaskImm =
3278bcb0991SDimitry Andric         TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328bdd1243dSDimitry Andric     return llvm::popcount(DMaskImm);
3298bcb0991SDimitry Andric   }
330480093f4SDimitry Andric   if (TII.isMTBUF(Opc)) {
331480093f4SDimitry Andric     return AMDGPU::getMTBUFElements(Opc);
332480093f4SDimitry Andric   }
3338bcb0991SDimitry Andric 
3348bcb0991SDimitry Andric   switch (Opc) {
3358bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
33881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
33981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
34081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
34181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
34281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
34381ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
3448bcb0991SDimitry Andric     return 1;
3458bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
34881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
34981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
35081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
35181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
35281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
35381ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
3548bcb0991SDimitry Andric     return 2;
355*c9157d92SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356*c9157d92SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357*c9157d92SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
35881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
35981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
36081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
36181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
36281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
36381ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
36481ad6265SDimitry Andric     return 3;
3658bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
36881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
36981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
37081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
37181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
37281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
37381ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
3748bcb0991SDimitry Andric     return 4;
375349cc55cSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
378349cc55cSDimitry Andric     return 8;
379bdd1243dSDimitry Andric   case AMDGPU::DS_READ_B32:      [[fallthrough]];
380bdd1243dSDimitry Andric   case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381bdd1243dSDimitry Andric   case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
382fe6060f1SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
383fe6060f1SDimitry Andric     return 1;
384bdd1243dSDimitry Andric   case AMDGPU::DS_READ_B64:      [[fallthrough]];
385bdd1243dSDimitry Andric   case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386bdd1243dSDimitry Andric   case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
387fe6060f1SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
388fe6060f1SDimitry Andric     return 2;
3898bcb0991SDimitry Andric   default:
3908bcb0991SDimitry Andric     return 0;
3918bcb0991SDimitry Andric   }
3928bcb0991SDimitry Andric }
3938bcb0991SDimitry Andric 
3948bcb0991SDimitry Andric /// Maps instruction opcode to enum InstClassEnum.
getInstClass(unsigned Opc,const SIInstrInfo & TII)3958bcb0991SDimitry Andric static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
3968bcb0991SDimitry Andric   switch (Opc) {
3978bcb0991SDimitry Andric   default:
3988bcb0991SDimitry Andric     if (TII.isMUBUF(Opc)) {
3998bcb0991SDimitry Andric       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
4008bcb0991SDimitry Andric       default:
4018bcb0991SDimitry Andric         return UNKNOWN;
4028bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
4038bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
4048bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
4058bcb0991SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
406*c9157d92SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
407*c9157d92SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
408*c9157d92SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
409*c9157d92SDimitry Andric       case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
4108bcb0991SDimitry Andric         return BUFFER_LOAD;
4118bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
4128bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
4138bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
4148bcb0991SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
415*c9157d92SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
416*c9157d92SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
417*c9157d92SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
418*c9157d92SDimitry Andric       case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
4198bcb0991SDimitry Andric         return BUFFER_STORE;
4208bcb0991SDimitry Andric       }
4218bcb0991SDimitry Andric     }
422*c9157d92SDimitry Andric     if (TII.isImage(Opc)) {
4238bcb0991SDimitry Andric       // Ignore instructions encoded without vaddr.
424bdd1243dSDimitry Andric       if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
425bdd1243dSDimitry Andric           !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
4268bcb0991SDimitry Andric         return UNKNOWN;
427349cc55cSDimitry Andric       // Ignore BVH instructions
428349cc55cSDimitry Andric       if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
429349cc55cSDimitry Andric         return UNKNOWN;
4308bcb0991SDimitry Andric       // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
431480093f4SDimitry Andric       if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
432480093f4SDimitry Andric           TII.isGather4(Opc))
4338bcb0991SDimitry Andric         return UNKNOWN;
4348bcb0991SDimitry Andric       return MIMG;
4358bcb0991SDimitry Andric     }
436480093f4SDimitry Andric     if (TII.isMTBUF(Opc)) {
437480093f4SDimitry Andric       switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
438480093f4SDimitry Andric       default:
439480093f4SDimitry Andric         return UNKNOWN;
440*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
441*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
442*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
443*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
444480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
445480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
446480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
447480093f4SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
448*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
449*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
450*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
451*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
452*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
453*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
454*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
455*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
456480093f4SDimitry Andric         return TBUFFER_LOAD;
457480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
458480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
459480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
460480093f4SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
461*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
462*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
463*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
464*c9157d92SDimitry Andric       case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
465480093f4SDimitry Andric         return TBUFFER_STORE;
466480093f4SDimitry Andric       }
467480093f4SDimitry Andric     }
4688bcb0991SDimitry Andric     return UNKNOWN;
4698bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
4708bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471*c9157d92SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
4728bcb0991SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
473349cc55cSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
4748bcb0991SDimitry Andric     return S_BUFFER_LOAD_IMM;
475bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
476bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
477*c9157d92SDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
478bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
479bdd1243dSDimitry Andric   case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
480bdd1243dSDimitry Andric     return S_BUFFER_LOAD_SGPR_IMM;
481bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORD_IMM:
482bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX2_IMM:
483*c9157d92SDimitry Andric   case AMDGPU::S_LOAD_DWORDX3_IMM:
484bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX4_IMM:
485bdd1243dSDimitry Andric   case AMDGPU::S_LOAD_DWORDX8_IMM:
486bdd1243dSDimitry Andric     return S_LOAD_IMM;
4878bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32:
4888bcb0991SDimitry Andric   case AMDGPU::DS_READ_B32_gfx9:
4898bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64:
4908bcb0991SDimitry Andric   case AMDGPU::DS_READ_B64_gfx9:
4918bcb0991SDimitry Andric     return DS_READ;
4928bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32:
4938bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B32_gfx9:
4948bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64:
4958bcb0991SDimitry Andric   case AMDGPU::DS_WRITE_B64_gfx9:
4968bcb0991SDimitry Andric     return DS_WRITE;
49781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD:
49881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2:
49981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3:
50081ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4:
50181ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORD:
50281ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX2:
50381ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX3:
50481ad6265SDimitry Andric   case AMDGPU::FLAT_LOAD_DWORDX4:
50581ad6265SDimitry Andric     return FLAT_LOAD;
50681ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
50781ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
50881ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
50981ad6265SDimitry Andric   case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
51081ad6265SDimitry Andric     return GLOBAL_LOAD_SADDR;
51181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD:
51281ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2:
51381ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3:
51481ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4:
51581ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORD:
51681ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX2:
51781ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX3:
51881ad6265SDimitry Andric   case AMDGPU::FLAT_STORE_DWORDX4:
51981ad6265SDimitry Andric     return FLAT_STORE;
52081ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
52181ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
52281ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
52381ad6265SDimitry Andric   case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
52481ad6265SDimitry Andric     return GLOBAL_STORE_SADDR;
5258bcb0991SDimitry Andric   }
5268bcb0991SDimitry Andric }
5278bcb0991SDimitry Andric 
/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may have
/// a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    // Buffer and image instructions merge with any width variant sharing the
    // same base opcode, so the base opcode is the subclass key.
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    // No subclass: instructions of this opcode are never merged.
    return -1;
  // DS reads/writes only pair with the exact same opcode (same width and
  // gfx9 variant), so the opcode itself is the subclass.
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  // Each scalar-load family maps all width variants to the single-dword
  // opcode so that e.g. DWORD and DWORDX2 loads can be merged.
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  // FLAT and GLOBAL (non-SADDR) loads share one subclass so a FLAT op can be
  // merged with a GLOBAL op (result is promoted to FLAT; see
  // getCommonInstClass).
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}
6018bcb0991SDimitry Andric 
60281ad6265SDimitry Andric // GLOBAL loads and stores are classified as FLAT initially. If both combined
60381ad6265SDimitry Andric // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
60481ad6265SDimitry Andric // If either or both instructions are non segment specific FLAT the resulting
60581ad6265SDimitry Andric // combined operation will be FLAT, potentially promoting one of the GLOBAL
60681ad6265SDimitry Andric // operations to FLAT.
60781ad6265SDimitry Andric // For other instructions return the original unmodified class.
60881ad6265SDimitry Andric InstClassEnum
getCommonInstClass(const CombineInfo & CI,const CombineInfo & Paired)60981ad6265SDimitry Andric SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
61081ad6265SDimitry Andric                                          const CombineInfo &Paired) {
61181ad6265SDimitry Andric   assert(CI.InstClass == Paired.InstClass);
61281ad6265SDimitry Andric 
61381ad6265SDimitry Andric   if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
61481ad6265SDimitry Andric       SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
61581ad6265SDimitry Andric     return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
61681ad6265SDimitry Andric 
61781ad6265SDimitry Andric   return CI.InstClass;
61881ad6265SDimitry Andric }
61981ad6265SDimitry Andric 
/// Returns which address operands (vaddr, srsrc, soffset, ...) the given
/// opcode carries, so setMI() can record their operand indices for later
/// address comparison between merge candidates.
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    // MUBUF address operands vary per opcode; query each one.
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // NSA form: vaddr registers are the consecutive operands between
      // vaddr0 and the resource operand.
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
    if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
      Result.SSamp = true;

    return Result;
  }
  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;

    return Result;
  }

  switch (Opc) {
  default:
    // No recognized address operands.
    return Result;
  // SGPR_IMM scalar buffer loads have soffset in addition to sbase; fall
  // through to pick up sbase with the other scalar loads.
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  // SADDR forms have a scalar base in addition to the vector address; fall
  // through to pick up vaddr with the plain GLOBAL/FLAT forms.
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
7248bcb0991SDimitry Andric 
/// Initializes this CombineInfo from the instruction at \p MI, caching the
/// instruction class, element size, offset, width, cache policy, and address
/// operand indices that the pairing logic later compares. Returns early for
/// instructions the optimizer does not handle (InstClass == UNKNOWN).
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  // AGPR and VGPR data cannot be merged together, so remember which one the
  // data operand uses.
  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
   EltSize =
          (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                          : 4;
   break;
  case DS_WRITE:
    EltSize =
          (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                            : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    // SMRD offsets are measured in different units depending on subtarget.
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    // DS offsets are 16-bit; mask off anything above.
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  // VIMAGE/VSAMPLE use the operand names 'rsrc'/'samp' instead of MIMG's
  // 'srsrc'/'ssamp'.
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  // Record the operand index of every address component this opcode has;
  // the order here fixes the comparison order used when pairing.
  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
8108bcb0991SDimitry Andric 
8110b57cec5SDimitry Andric } // end anonymous namespace.
8120b57cec5SDimitry Andric 
// Register the pass with the legacy pass manager; alias analysis is used by
// canSwapInstructions() to prove memory operations can be reordered.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
                    false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

/// Factory used by the AMDGPU target pass pipeline.
FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
8260b57cec5SDimitry Andric 
addDefsUsesToList(const MachineInstr & MI,DenseSet<Register> & RegDefs,DenseSet<Register> & RegUses)8270b57cec5SDimitry Andric static void addDefsUsesToList(const MachineInstr &MI,
8285ffd83dbSDimitry Andric                               DenseSet<Register> &RegDefs,
82981ad6265SDimitry Andric                               DenseSet<Register> &RegUses) {
83081ad6265SDimitry Andric   for (const auto &Op : MI.operands()) {
83181ad6265SDimitry Andric     if (!Op.isReg())
83281ad6265SDimitry Andric       continue;
8330b57cec5SDimitry Andric     if (Op.isDef())
8340b57cec5SDimitry Andric       RegDefs.insert(Op.getReg());
83581ad6265SDimitry Andric     if (Op.readsReg())
83681ad6265SDimitry Andric       RegUses.insert(Op.getReg());
8370b57cec5SDimitry Andric   }
8380b57cec5SDimitry Andric }
8390b57cec5SDimitry Andric 
canSwapInstructions(const DenseSet<Register> & ARegDefs,const DenseSet<Register> & ARegUses,const MachineInstr & A,const MachineInstr & B) const84081ad6265SDimitry Andric bool SILoadStoreOptimizer::canSwapInstructions(
84181ad6265SDimitry Andric     const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
84281ad6265SDimitry Andric     const MachineInstr &A, const MachineInstr &B) const {
84381ad6265SDimitry Andric   if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
84481ad6265SDimitry Andric       (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
8450b57cec5SDimitry Andric     return false;
84681ad6265SDimitry Andric   for (const auto &BOp : B.operands()) {
84781ad6265SDimitry Andric     if (!BOp.isReg())
8480b57cec5SDimitry Andric       continue;
84981ad6265SDimitry Andric     if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
85081ad6265SDimitry Andric       return false;
85181ad6265SDimitry Andric     if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
8520b57cec5SDimitry Andric       return false;
8530b57cec5SDimitry Andric   }
8540b57cec5SDimitry Andric   return true;
8550b57cec5SDimitry Andric }
8560b57cec5SDimitry Andric 
// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  // The combined access covers both original accesses.
  unsigned Size = MMOa->getSize() + MMOb->getSize();

  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.
  if (Paired < CI)
    std::swap(MMOa, MMOb);

  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
  // If merging FLAT and GLOBAL set address space to FLAT.
  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
    PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;

  // Clone the leading MMO with the widened size and adjusted pointer info;
  // flags and alignment are inherited from MMOa.
  MachineFunction *MF = CI.I->getMF();
  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
}
8808bcb0991SDimitry Andric 
dmasksCanBeCombined(const CombineInfo & CI,const SIInstrInfo & TII,const CombineInfo & Paired)881480093f4SDimitry Andric bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
882480093f4SDimitry Andric                                                const SIInstrInfo &TII,
883480093f4SDimitry Andric                                                const CombineInfo &Paired) {
8848bcb0991SDimitry Andric   assert(CI.InstClass == MIMG);
8858bcb0991SDimitry Andric 
8868bcb0991SDimitry Andric   // Ignore instructions with tfe/lwe set.
8878bcb0991SDimitry Andric   const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
8888bcb0991SDimitry Andric   const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
8898bcb0991SDimitry Andric 
8908bcb0991SDimitry Andric   if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
8918bcb0991SDimitry Andric     return false;
8928bcb0991SDimitry Andric 
8938bcb0991SDimitry Andric   // Check other optional immediate operands for equality.
894fe6060f1SDimitry Andric   unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
895fe6060f1SDimitry Andric                                 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
896fe6060f1SDimitry Andric                                 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
8978bcb0991SDimitry Andric 
8988bcb0991SDimitry Andric   for (auto op : OperandsToMatch) {
8998bcb0991SDimitry Andric     int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
900480093f4SDimitry Andric     if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
9018bcb0991SDimitry Andric       return false;
9028bcb0991SDimitry Andric     if (Idx != -1 &&
903480093f4SDimitry Andric         CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
9048bcb0991SDimitry Andric       return false;
9058bcb0991SDimitry Andric   }
9068bcb0991SDimitry Andric 
9078bcb0991SDimitry Andric   // Check DMask for overlaps.
908480093f4SDimitry Andric   unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
909480093f4SDimitry Andric   unsigned MinMask = std::min(CI.DMask, Paired.DMask);
9108bcb0991SDimitry Andric 
911*c9157d92SDimitry Andric   if (!MaxMask)
912*c9157d92SDimitry Andric     return false;
913*c9157d92SDimitry Andric 
914fe013be4SDimitry Andric   unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
9158bcb0991SDimitry Andric   if ((1u << AllowedBitsForMin) <= MinMask)
9168bcb0991SDimitry Andric     return false;
9178bcb0991SDimitry Andric 
9188bcb0991SDimitry Andric   return true;
9198bcb0991SDimitry Andric }
9208bcb0991SDimitry Andric 
getBufferFormatWithCompCount(unsigned OldFormat,unsigned ComponentCount,const GCNSubtarget & STI)921480093f4SDimitry Andric static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
922480093f4SDimitry Andric                                        unsigned ComponentCount,
9235ffd83dbSDimitry Andric                                        const GCNSubtarget &STI) {
924480093f4SDimitry Andric   if (ComponentCount > 4)
925480093f4SDimitry Andric     return 0;
926480093f4SDimitry Andric 
927480093f4SDimitry Andric   const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
928480093f4SDimitry Andric       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
929480093f4SDimitry Andric   if (!OldFormatInfo)
930480093f4SDimitry Andric     return 0;
931480093f4SDimitry Andric 
932480093f4SDimitry Andric   const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
933480093f4SDimitry Andric       llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
934480093f4SDimitry Andric                                            ComponentCount,
935480093f4SDimitry Andric                                            OldFormatInfo->NumFormat, STI);
936480093f4SDimitry Andric 
937480093f4SDimitry Andric   if (!NewFormatInfo)
938480093f4SDimitry Andric     return 0;
939480093f4SDimitry Andric 
940480093f4SDimitry Andric   assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
941480093f4SDimitry Andric          NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
942480093f4SDimitry Andric 
943480093f4SDimitry Andric   return NewFormatInfo->Format;
944480093f4SDimitry Andric }
945480093f4SDimitry Andric 
946fe6060f1SDimitry Andric // Return the value in the inclusive range [Lo,Hi] that is aligned to the
947fe6060f1SDimitry Andric // highest power of two. Note that the result is well defined for all inputs
948fe6060f1SDimitry Andric // including corner cases like:
949fe6060f1SDimitry Andric // - if Lo == Hi, return that value
950fe6060f1SDimitry Andric // - if Lo == 0, return 0 (even though the "- 1" below underflows
951fe6060f1SDimitry Andric // - if Lo > Hi, return 0 (as if the range wrapped around)
mostAlignedValueInRange(uint32_t Lo,uint32_t Hi)952fe6060f1SDimitry Andric static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
953fe013be4SDimitry Andric   return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
954fe6060f1SDimitry Andric }
955fe6060f1SDimitry Andric 
offsetsCanBeCombined(CombineInfo & CI,const GCNSubtarget & STI,CombineInfo & Paired,bool Modify)956480093f4SDimitry Andric bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
9575ffd83dbSDimitry Andric                                                 const GCNSubtarget &STI,
9585ffd83dbSDimitry Andric                                                 CombineInfo &Paired,
9595ffd83dbSDimitry Andric                                                 bool Modify) {
9608bcb0991SDimitry Andric   assert(CI.InstClass != MIMG);
9618bcb0991SDimitry Andric 
9620b57cec5SDimitry Andric   // XXX - Would the same offset be OK? Is there any reason this would happen or
9630b57cec5SDimitry Andric   // be useful?
964480093f4SDimitry Andric   if (CI.Offset == Paired.Offset)
9650b57cec5SDimitry Andric     return false;
9660b57cec5SDimitry Andric 
9670b57cec5SDimitry Andric   // This won't be valid if the offset isn't aligned.
968480093f4SDimitry Andric   if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
9690b57cec5SDimitry Andric     return false;
9700b57cec5SDimitry Andric 
971480093f4SDimitry Andric   if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
972480093f4SDimitry Andric 
973480093f4SDimitry Andric     const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
974480093f4SDimitry Andric         llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
975480093f4SDimitry Andric     if (!Info0)
976480093f4SDimitry Andric       return false;
977480093f4SDimitry Andric     const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
978480093f4SDimitry Andric         llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
979480093f4SDimitry Andric     if (!Info1)
980480093f4SDimitry Andric       return false;
981480093f4SDimitry Andric 
982480093f4SDimitry Andric     if (Info0->BitsPerComp != Info1->BitsPerComp ||
983480093f4SDimitry Andric         Info0->NumFormat != Info1->NumFormat)
984480093f4SDimitry Andric       return false;
985480093f4SDimitry Andric 
986480093f4SDimitry Andric     // TODO: Should be possible to support more formats, but if format loads
987480093f4SDimitry Andric     // are not dword-aligned, the merged load might not be valid.
988480093f4SDimitry Andric     if (Info0->BitsPerComp != 32)
989480093f4SDimitry Andric       return false;
990480093f4SDimitry Andric 
991480093f4SDimitry Andric     if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
992480093f4SDimitry Andric       return false;
993480093f4SDimitry Andric   }
994480093f4SDimitry Andric 
995fe6060f1SDimitry Andric   uint32_t EltOffset0 = CI.Offset / CI.EltSize;
996fe6060f1SDimitry Andric   uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
9970b57cec5SDimitry Andric   CI.UseST64 = false;
9980b57cec5SDimitry Andric   CI.BaseOff = 0;
9990b57cec5SDimitry Andric 
1000fe6060f1SDimitry Andric   // Handle all non-DS instructions.
10010b57cec5SDimitry Andric   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1002fe013be4SDimitry Andric     if (EltOffset0 + CI.Width != EltOffset1 &&
1003fe013be4SDimitry Andric             EltOffset1 + Paired.Width != EltOffset0)
1004fe013be4SDimitry Andric       return false;
1005fe013be4SDimitry Andric     if (CI.CPol != Paired.CPol)
1006fe013be4SDimitry Andric       return false;
1007*c9157d92SDimitry Andric     if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1008*c9157d92SDimitry Andric         CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1009*c9157d92SDimitry Andric       // Reject cases like:
1010*c9157d92SDimitry Andric       //   dword + dwordx2 -> dwordx3
1011*c9157d92SDimitry Andric       //   dword + dwordx3 -> dwordx4
1012*c9157d92SDimitry Andric       // If we tried to combine these cases, we would fail to extract a subreg
1013*c9157d92SDimitry Andric       // for the result of the second load due to SGPR alignment requirements.
1014*c9157d92SDimitry Andric       if (CI.Width != Paired.Width &&
1015*c9157d92SDimitry Andric           (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1016*c9157d92SDimitry Andric         return false;
1017*c9157d92SDimitry Andric     }
1018fe013be4SDimitry Andric     return true;
10190b57cec5SDimitry Andric   }
10200b57cec5SDimitry Andric 
10210b57cec5SDimitry Andric   // If the offset in elements doesn't fit in 8-bits, we might be able to use
10220b57cec5SDimitry Andric   // the stride 64 versions.
10230b57cec5SDimitry Andric   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
10240b57cec5SDimitry Andric       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
10255ffd83dbSDimitry Andric     if (Modify) {
1026480093f4SDimitry Andric       CI.Offset = EltOffset0 / 64;
1027480093f4SDimitry Andric       Paired.Offset = EltOffset1 / 64;
10280b57cec5SDimitry Andric       CI.UseST64 = true;
10295ffd83dbSDimitry Andric     }
10300b57cec5SDimitry Andric     return true;
10310b57cec5SDimitry Andric   }
10320b57cec5SDimitry Andric 
10330b57cec5SDimitry Andric   // Check if the new offsets fit in the reduced 8-bit range.
10340b57cec5SDimitry Andric   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
10355ffd83dbSDimitry Andric     if (Modify) {
1036480093f4SDimitry Andric       CI.Offset = EltOffset0;
1037480093f4SDimitry Andric       Paired.Offset = EltOffset1;
10385ffd83dbSDimitry Andric     }
10390b57cec5SDimitry Andric     return true;
10400b57cec5SDimitry Andric   }
10410b57cec5SDimitry Andric 
10420b57cec5SDimitry Andric   // Try to shift base address to decrease offsets.
1043fe6060f1SDimitry Andric   uint32_t Min = std::min(EltOffset0, EltOffset1);
1044fe6060f1SDimitry Andric   uint32_t Max = std::max(EltOffset0, EltOffset1);
10450b57cec5SDimitry Andric 
1046fe6060f1SDimitry Andric   const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1047fe6060f1SDimitry Andric   if (((Max - Min) & ~Mask) == 0) {
10485ffd83dbSDimitry Andric     if (Modify) {
1049fe6060f1SDimitry Andric       // From the range of values we could use for BaseOff, choose the one that
1050fe6060f1SDimitry Andric       // is aligned to the highest power of two, to maximise the chance that
1051fe6060f1SDimitry Andric       // the same offset can be reused for other load/store pairs.
1052fe6060f1SDimitry Andric       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1053fe6060f1SDimitry Andric       // Copy the low bits of the offsets, so that when we adjust them by
1054fe6060f1SDimitry Andric       // subtracting BaseOff they will be multiples of 64.
1055fe6060f1SDimitry Andric       BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1056fe6060f1SDimitry Andric       CI.BaseOff = BaseOff * CI.EltSize;
1057fe6060f1SDimitry Andric       CI.Offset = (EltOffset0 - BaseOff) / 64;
1058fe6060f1SDimitry Andric       Paired.Offset = (EltOffset1 - BaseOff) / 64;
10590b57cec5SDimitry Andric       CI.UseST64 = true;
10605ffd83dbSDimitry Andric     }
10610b57cec5SDimitry Andric     return true;
10620b57cec5SDimitry Andric   }
10630b57cec5SDimitry Andric 
1064fe6060f1SDimitry Andric   if (isUInt<8>(Max - Min)) {
10655ffd83dbSDimitry Andric     if (Modify) {
1066fe6060f1SDimitry Andric       // From the range of values we could use for BaseOff, choose the one that
1067fe6060f1SDimitry Andric       // is aligned to the highest power of two, to maximise the chance that
1068fe6060f1SDimitry Andric       // the same offset can be reused for other load/store pairs.
1069fe6060f1SDimitry Andric       uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1070fe6060f1SDimitry Andric       CI.BaseOff = BaseOff * CI.EltSize;
1071fe6060f1SDimitry Andric       CI.Offset = EltOffset0 - BaseOff;
1072fe6060f1SDimitry Andric       Paired.Offset = EltOffset1 - BaseOff;
10735ffd83dbSDimitry Andric     }
10740b57cec5SDimitry Andric     return true;
10750b57cec5SDimitry Andric   }
10760b57cec5SDimitry Andric 
10770b57cec5SDimitry Andric   return false;
10780b57cec5SDimitry Andric }
10790b57cec5SDimitry Andric 
widthsFit(const GCNSubtarget & STM,const CombineInfo & CI,const CombineInfo & Paired)10800b57cec5SDimitry Andric bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1081480093f4SDimitry Andric                                      const CombineInfo &CI,
1082480093f4SDimitry Andric                                      const CombineInfo &Paired) {
1083480093f4SDimitry Andric   const unsigned Width = (CI.Width + Paired.Width);
10840b57cec5SDimitry Andric   switch (CI.InstClass) {
10850b57cec5SDimitry Andric   default:
10860b57cec5SDimitry Andric     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
10870b57cec5SDimitry Andric   case S_BUFFER_LOAD_IMM:
1088bdd1243dSDimitry Andric   case S_BUFFER_LOAD_SGPR_IMM:
1089bdd1243dSDimitry Andric   case S_LOAD_IMM:
10900b57cec5SDimitry Andric     switch (Width) {
10910b57cec5SDimitry Andric     default:
10920b57cec5SDimitry Andric       return false;
10930b57cec5SDimitry Andric     case 2:
10940b57cec5SDimitry Andric     case 4:
1095349cc55cSDimitry Andric     case 8:
10960b57cec5SDimitry Andric       return true;
1097*c9157d92SDimitry Andric     case 3:
1098*c9157d92SDimitry Andric       return STM.hasScalarDwordx3Loads();
10990b57cec5SDimitry Andric     }
11000b57cec5SDimitry Andric   }
11010b57cec5SDimitry Andric }
11020b57cec5SDimitry Andric 
1103fe6060f1SDimitry Andric const TargetRegisterClass *
getDataRegClass(const MachineInstr & MI) const1104fe6060f1SDimitry Andric SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1105fe6060f1SDimitry Andric   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1106fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1107fe6060f1SDimitry Andric   }
1108fe6060f1SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1109fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1110fe6060f1SDimitry Andric   }
1111fe6060f1SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1112fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1113fe6060f1SDimitry Andric   }
1114fe6060f1SDimitry Andric   if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1115fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Dst->getReg());
1116fe6060f1SDimitry Andric   }
1117fe6060f1SDimitry Andric   if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1118fe6060f1SDimitry Andric     return TRI->getRegClassForReg(*MRI, Src->getReg());
1119fe6060f1SDimitry Andric   }
1120fe6060f1SDimitry Andric   return nullptr;
1121fe6060f1SDimitry Andric }
1122fe6060f1SDimitry Andric 
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
///
/// On success the returned CombineInfo identifies which of the two original
/// instructions the merged one should be inserted at: loads are hoisted up to
/// CI's position, stores are sunk down to Paired's. For DS instructions the
/// offsets in CI/Paired are additionally rewritten (via offsetsCanBeCombined
/// with Modify=true) to the encoded form the merged instruction needs.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  // Same class is not enough: the two instructions must also belong to the
  // same subclass (same concrete opcode family) to be mergeable.
  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  // Verify that every instruction strictly between CI and Paired can be
  // swapped past the one we intend to move; otherwise merging would reorder
  // a conflicting def/use or aliasing memory access.
  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  CombineInfo *Where;
  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    addDefsUsesToList(*Paired.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
        return nullptr;
    }
    Where = &CI;
  } else {
    // Try to sink CI down to Paired.
    addDefsUsesToList(*CI.I, RegDefs, RegUses);
    for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
        return nullptr;
    }
    Where = &Paired;
  }

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction.  This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
11770b57cec5SDimitry Andric 
read2Opcode(unsigned EltSize) const11780b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
11790b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
11800b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
11810b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
11820b57cec5SDimitry Andric }
11830b57cec5SDimitry Andric 
read2ST64Opcode(unsigned EltSize) const11840b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
11850b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
11860b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
11870b57cec5SDimitry Andric 
11880b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
11890b57cec5SDimitry Andric                         : AMDGPU::DS_READ2ST64_B64_gfx9;
11900b57cec5SDimitry Andric }
11910b57cec5SDimitry Andric 
/// Merge two DS reads (CI and Paired) into a single ds_read2/ds_read2st64,
/// inserted at InsertBefore. Emits COPYs from the merged result's subregs to
/// the two original destination registers, erases the originals, and returns
/// an iterator to the new read2 instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  // CI.Offset/Paired.Offset were rewritten to encoded element offsets by
  // offsetsCanBeCombined(..., Modify=true) in checkAndPrepareMerge.
  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // The base had to be shifted to make the offsets encodable: materialize
    // the byte offset into an SGPR and add it to the original address.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Read2;
}
12720b57cec5SDimitry Andric 
write2Opcode(unsigned EltSize) const12730b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
12740b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
12750b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
12760b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
12770b57cec5SDimitry Andric                         : AMDGPU::DS_WRITE2_B64_gfx9;
12780b57cec5SDimitry Andric }
12790b57cec5SDimitry Andric 
write2ST64Opcode(unsigned EltSize) const12800b57cec5SDimitry Andric unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
12810b57cec5SDimitry Andric   if (STM->ldsRequiresM0Init())
12820b57cec5SDimitry Andric     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
12830b57cec5SDimitry Andric                           : AMDGPU::DS_WRITE2ST64_B64;
12840b57cec5SDimitry Andric 
12850b57cec5SDimitry Andric   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
12860b57cec5SDimitry Andric                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
12870b57cec5SDimitry Andric }
12880b57cec5SDimitry Andric 
/// Merge two DS writes (CI and Paired) into a single ds_write2/ds_write2st64
/// inserted at InsertBefore, erase the originals, and return an iterator to
/// the new write2 instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  // CI.Offset/Paired.Offset were rewritten to encoded element offsets by
  // offsetsCanBeCombined(..., Modify=true) in checkAndPrepareMerge.
  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    // The base had to be shifted to make the offsets encodable: materialize
    // the byte offset into an SGPR and add it to the original address.
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
        .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
13540b57cec5SDimitry Andric 
/// Merge two MIMG instructions (CI and Paired) into a single one with the
/// union of their dmasks, inserted at InsertBefore. Copies the merged
/// result's subregisters back to the original destinations, erases the
/// originals, and returns the new instruction.
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  // Clone CI's operands (skipping the def at index 0), substituting the
  // merged dmask at the dmask operand position.
  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14038bcb0991SDimitry Andric 
/// Merge two scalar memory loads (s_load / s_buffer_load, CI and Paired)
/// into a single wider load inserted at InsertBefore, using the smaller of
/// the two offsets. Copies the result's subregisters back to the original
/// destinations, erases the originals, and returns the new instruction.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  // Only the SGPR+IMM form carries an soffset operand.
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
14498bcb0991SDimitry Andric 
// Merge two adjacent MUBUF buffer loads (CI.I and Paired.I) into a single
// wider buffer load inserted at InsertBefore, then copy the subregisters of
// the merged result back into the two original destination registers.
// Returns an iterator to the newly created load.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  // Create the destination register for the merged (wider) load.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  // The combined access starts at the lower of the two offsets.
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Only some MUBUF variants carry a vaddr operand.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
        .addImm(MergedOffset) // offset
        .addImm(CI.CPol)      // cpol
        .addImm(0)            // swz
        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1); // Last use of DestReg.

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
15040b57cec5SDimitry Andric 
mergeTBufferLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)15055ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
15065ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
150781ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
1508480093f4SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
1509480093f4SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
1510480093f4SDimitry Andric 
1511480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
1512480093f4SDimitry Andric 
1513480093f4SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1514480093f4SDimitry Andric 
1515480093f4SDimitry Andric   // Copy to the new source register.
1516480093f4SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
1517480093f4SDimitry Andric   unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1518480093f4SDimitry Andric 
151981ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1520480093f4SDimitry Andric 
15215ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
1522480093f4SDimitry Andric 
15235ffd83dbSDimitry Andric   if (Regs.VAddr)
1524480093f4SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1525480093f4SDimitry Andric 
1526480093f4SDimitry Andric   unsigned JoinedFormat =
15275ffd83dbSDimitry Andric       getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1528480093f4SDimitry Andric 
1529480093f4SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
1530480093f4SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
1531480093f4SDimitry Andric   // will return true if this is the case.
1532480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1533480093f4SDimitry Andric 
1534480093f4SDimitry Andric   MachineInstr *New =
1535480093f4SDimitry Andric       MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1536480093f4SDimitry Andric           .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1537480093f4SDimitry Andric           .addImm(MergedOffset) // offset
1538480093f4SDimitry Andric           .addImm(JoinedFormat) // format
1539fe6060f1SDimitry Andric           .addImm(CI.CPol)      // cpol
1540480093f4SDimitry Andric           .addImm(0)            // swz
154181ad6265SDimitry Andric           .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1542480093f4SDimitry Andric 
1543480093f4SDimitry Andric   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1544480093f4SDimitry Andric   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1545480093f4SDimitry Andric   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1546480093f4SDimitry Andric 
1547480093f4SDimitry Andric   // Copy to the old destination registers.
1548480093f4SDimitry Andric   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1549480093f4SDimitry Andric   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1550480093f4SDimitry Andric   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1551480093f4SDimitry Andric 
155281ad6265SDimitry Andric   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1553480093f4SDimitry Andric       .add(*Dest0) // Copy to same destination including flags and sub reg.
1554480093f4SDimitry Andric       .addReg(DestReg, 0, SubRegIdx0);
155581ad6265SDimitry Andric   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1556480093f4SDimitry Andric       .add(*Dest1)
1557480093f4SDimitry Andric       .addReg(DestReg, RegState::Kill, SubRegIdx1);
1558480093f4SDimitry Andric 
1559480093f4SDimitry Andric   CI.I->eraseFromParent();
1560480093f4SDimitry Andric   Paired.I->eraseFromParent();
1561480093f4SDimitry Andric   return New;
1562480093f4SDimitry Andric }
1563480093f4SDimitry Andric 
// Merge two adjacent MTBUF stores (CI.I and Paired.I) into one wider typed
// buffer store: gather both source values into a super register with a
// REG_SEQUENCE, then emit a single store at InsertBefore.
// Returns an iterator to the newly created store.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  // Pack both stored values into the super register.
  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  // Only some MTBUF variants carry a vaddr operand.
  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // Widen the buffer format to cover both components.
  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
161881ad6265SDimitry Andric 
mergeFlatLoadPair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)161981ad6265SDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
162081ad6265SDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
162181ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
162281ad6265SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
162381ad6265SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
162481ad6265SDimitry Andric 
162581ad6265SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
162681ad6265SDimitry Andric 
162781ad6265SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
162881ad6265SDimitry Andric   Register DestReg = MRI->createVirtualRegister(SuperRC);
162981ad6265SDimitry Andric 
163081ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
163181ad6265SDimitry Andric 
163281ad6265SDimitry Andric   if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
163381ad6265SDimitry Andric     MIB.add(*SAddr);
163481ad6265SDimitry Andric 
163581ad6265SDimitry Andric   MachineInstr *New =
163681ad6265SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
163781ad6265SDimitry Andric        .addImm(std::min(CI.Offset, Paired.Offset))
163881ad6265SDimitry Andric        .addImm(CI.CPol)
163981ad6265SDimitry Andric        .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
164081ad6265SDimitry Andric 
164181ad6265SDimitry Andric   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
164281ad6265SDimitry Andric   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
164381ad6265SDimitry Andric   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
164481ad6265SDimitry Andric 
164581ad6265SDimitry Andric   // Copy to the old destination registers.
164681ad6265SDimitry Andric   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
164781ad6265SDimitry Andric   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
164881ad6265SDimitry Andric   const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
164981ad6265SDimitry Andric 
165081ad6265SDimitry Andric   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
165181ad6265SDimitry Andric       .add(*Dest0) // Copy to same destination including flags and sub reg.
165281ad6265SDimitry Andric       .addReg(DestReg, 0, SubRegIdx0);
165381ad6265SDimitry Andric   BuildMI(*MBB, InsertBefore, DL, CopyDesc)
165481ad6265SDimitry Andric       .add(*Dest1)
165581ad6265SDimitry Andric       .addReg(DestReg, RegState::Kill, SubRegIdx1);
165681ad6265SDimitry Andric 
165781ad6265SDimitry Andric   CI.I->eraseFromParent();
165881ad6265SDimitry Andric   Paired.I->eraseFromParent();
165981ad6265SDimitry Andric   return New;
166081ad6265SDimitry Andric }
166181ad6265SDimitry Andric 
// Merge two adjacent FLAT/GLOBAL stores (CI.I and Paired.I) into a single
// wider store: gather both source values with a REG_SEQUENCE, then emit one
// store at InsertBefore. Returns an iterator to the new store.
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  // Pack both stored values into the super register.
  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  // SADDR variants carry an extra scalar address operand.
  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
    MIB.addImm(std::min(CI.Offset, Paired.Offset))
       .addImm(CI.CPol)
       .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
1703480093f4SDimitry Andric 
// Select the opcode of the merged instruction for the pair (CI, Paired),
// based on their common instruction class and the combined dword width.
// Returns 0 when no wider opcode exists for that width.
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    // The remaining classes are plain MUBUF loads/stores; the wider opcode
    // is derived from the base opcode via the MUBUF opcode tables.
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);

  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3:
      return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4:
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8:
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3:
      return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4:
      return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8:
      return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3:
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4:
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3:
      return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4:
      return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2:
      return AMDGPU::FLAT_STORE_DWORDX2;
    case 3:
      return AMDGPU::FLAT_STORE_DWORDX3;
    case 4:
      return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    // For image loads the combined dmask must have exactly Width bits set
    // (the pairing logic guarantees the two dmasks do not overlap).
    assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
           "No overlaps");
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}
18320b57cec5SDimitry Andric 
18330b57cec5SDimitry Andric std::pair<unsigned, unsigned>
getSubRegIdxs(const CombineInfo & CI,const CombineInfo & Paired)1834349cc55cSDimitry Andric SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1835349cc55cSDimitry Andric                                     const CombineInfo &Paired) {
1836bdd1243dSDimitry Andric   assert((CI.InstClass != MIMG ||
1837bdd1243dSDimitry Andric           ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
183881ad6265SDimitry Andric            CI.Width + Paired.Width)) &&
18398bcb0991SDimitry Andric          "No overlaps");
18408bcb0991SDimitry Andric 
1841349cc55cSDimitry Andric   unsigned Idx0;
1842349cc55cSDimitry Andric   unsigned Idx1;
1843349cc55cSDimitry Andric 
184404eeddc0SDimitry Andric   static const unsigned Idxs[5][4] = {
18458bcb0991SDimitry Andric       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
184604eeddc0SDimitry Andric       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
184704eeddc0SDimitry Andric       {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
184804eeddc0SDimitry Andric       {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
184904eeddc0SDimitry Andric       {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
18508bcb0991SDimitry Andric   };
18518bcb0991SDimitry Andric 
185204eeddc0SDimitry Andric   assert(CI.Width >= 1 && CI.Width <= 4);
185304eeddc0SDimitry Andric   assert(Paired.Width >= 1 && Paired.Width <= 4);
18548bcb0991SDimitry Andric 
185581ad6265SDimitry Andric   if (Paired < CI) {
1856480093f4SDimitry Andric     Idx1 = Idxs[0][Paired.Width - 1];
1857480093f4SDimitry Andric     Idx0 = Idxs[Paired.Width][CI.Width - 1];
18580b57cec5SDimitry Andric   } else {
1859480093f4SDimitry Andric     Idx0 = Idxs[0][CI.Width - 1];
1860480093f4SDimitry Andric     Idx1 = Idxs[CI.Width][Paired.Width - 1];
18610b57cec5SDimitry Andric   }
18628bcb0991SDimitry Andric 
1863bdd1243dSDimitry Andric   return std::pair(Idx0, Idx1);
18640b57cec5SDimitry Andric }
18650b57cec5SDimitry Andric 
18660b57cec5SDimitry Andric const TargetRegisterClass *
getTargetRegisterClass(const CombineInfo & CI,const CombineInfo & Paired)1867480093f4SDimitry Andric SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1868480093f4SDimitry Andric                                              const CombineInfo &Paired) {
1869bdd1243dSDimitry Andric   if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1870bdd1243dSDimitry Andric       CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1871480093f4SDimitry Andric     switch (CI.Width + Paired.Width) {
18720b57cec5SDimitry Andric     default:
18730b57cec5SDimitry Andric       return nullptr;
18740b57cec5SDimitry Andric     case 2:
18750b57cec5SDimitry Andric       return &AMDGPU::SReg_64_XEXECRegClass;
1876*c9157d92SDimitry Andric     case 3:
1877*c9157d92SDimitry Andric       return &AMDGPU::SGPR_96RegClass;
18780b57cec5SDimitry Andric     case 4:
18798bcb0991SDimitry Andric       return &AMDGPU::SGPR_128RegClass;
18800b57cec5SDimitry Andric     case 8:
18815ffd83dbSDimitry Andric       return &AMDGPU::SGPR_256RegClass;
18820b57cec5SDimitry Andric     case 16:
18835ffd83dbSDimitry Andric       return &AMDGPU::SGPR_512RegClass;
18840b57cec5SDimitry Andric     }
18850b57cec5SDimitry Andric   }
1886fe6060f1SDimitry Andric 
1887fe6060f1SDimitry Andric   unsigned BitWidth = 32 * (CI.Width + Paired.Width);
18884824e7fdSDimitry Andric   return TRI->isAGPRClass(getDataRegClass(*CI.I))
1889fe6060f1SDimitry Andric              ? TRI->getAGPRClassForBitWidth(BitWidth)
1890fe6060f1SDimitry Andric              : TRI->getVGPRClassForBitWidth(BitWidth);
18910b57cec5SDimitry Andric }
18920b57cec5SDimitry Andric 
mergeBufferStorePair(CombineInfo & CI,CombineInfo & Paired,MachineBasicBlock::iterator InsertBefore)18935ffd83dbSDimitry Andric MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
18945ffd83dbSDimitry Andric     CombineInfo &CI, CombineInfo &Paired,
189581ad6265SDimitry Andric     MachineBasicBlock::iterator InsertBefore) {
18960b57cec5SDimitry Andric   MachineBasicBlock *MBB = CI.I->getParent();
18970b57cec5SDimitry Andric   DebugLoc DL = CI.I->getDebugLoc();
18980b57cec5SDimitry Andric 
1899480093f4SDimitry Andric   const unsigned Opcode = getNewOpcode(CI, Paired);
19000b57cec5SDimitry Andric 
1901480093f4SDimitry Andric   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
19020b57cec5SDimitry Andric   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
19030b57cec5SDimitry Andric   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
19040b57cec5SDimitry Andric 
19050b57cec5SDimitry Andric   // Copy to the new source register.
1906480093f4SDimitry Andric   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
19078bcb0991SDimitry Andric   Register SrcReg = MRI->createVirtualRegister(SuperRC);
19080b57cec5SDimitry Andric 
19090b57cec5SDimitry Andric   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1910480093f4SDimitry Andric   const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
19110b57cec5SDimitry Andric 
191281ad6265SDimitry Andric   BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
19130b57cec5SDimitry Andric       .add(*Src0)
19140b57cec5SDimitry Andric       .addImm(SubRegIdx0)
19150b57cec5SDimitry Andric       .add(*Src1)
19160b57cec5SDimitry Andric       .addImm(SubRegIdx1);
19170b57cec5SDimitry Andric 
191881ad6265SDimitry Andric   auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
19190b57cec5SDimitry Andric                  .addReg(SrcReg, RegState::Kill);
19200b57cec5SDimitry Andric 
19215ffd83dbSDimitry Andric   AddressRegs Regs = getRegs(Opcode, *TII);
19220b57cec5SDimitry Andric 
19235ffd83dbSDimitry Andric   if (Regs.VAddr)
19240b57cec5SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
19250b57cec5SDimitry Andric 
19268bcb0991SDimitry Andric 
19278bcb0991SDimitry Andric   // It shouldn't be possible to get this far if the two instructions
19288bcb0991SDimitry Andric   // don't have a single memoperand, because MachineInstr::mayAlias()
19298bcb0991SDimitry Andric   // will return true if this is the case.
1930480093f4SDimitry Andric   assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
19318bcb0991SDimitry Andric 
19328bcb0991SDimitry Andric   MachineInstr *New =
19330b57cec5SDimitry Andric     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
19340b57cec5SDimitry Andric         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1935480093f4SDimitry Andric         .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1936fe6060f1SDimitry Andric         .addImm(CI.CPol)      // cpol
19378bcb0991SDimitry Andric         .addImm(0)            // swz
193881ad6265SDimitry Andric         .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
19390b57cec5SDimitry Andric 
19400b57cec5SDimitry Andric   CI.I->eraseFromParent();
1941480093f4SDimitry Andric   Paired.I->eraseFromParent();
19428bcb0991SDimitry Andric   return New;
19430b57cec5SDimitry Andric }
19440b57cec5SDimitry Andric 
19450b57cec5SDimitry Andric MachineOperand
createRegOrImm(int32_t Val,MachineInstr & MI) const19468bcb0991SDimitry Andric SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
19470b57cec5SDimitry Andric   APInt V(32, Val, true);
19480b57cec5SDimitry Andric   if (TII->isInlineConstant(V))
19490b57cec5SDimitry Andric     return MachineOperand::CreateImm(Val);
19500b57cec5SDimitry Andric 
19518bcb0991SDimitry Andric   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
19520b57cec5SDimitry Andric   MachineInstr *Mov =
19530b57cec5SDimitry Andric   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
19540b57cec5SDimitry Andric           TII->get(AMDGPU::S_MOV_B32), Reg)
19550b57cec5SDimitry Andric     .addImm(Val);
19560b57cec5SDimitry Andric   (void)Mov;
19570b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
19580b57cec5SDimitry Andric   return MachineOperand::CreateReg(Reg, false);
19590b57cec5SDimitry Andric }
19600b57cec5SDimitry Andric 
// Compute base address using Addr and return the final register.
// Materializes the 64-bit address (Addr.Base + Addr.Offset) immediately
// before MI as a 32-bit add-with-carry pair followed by a REG_SEQUENCE, and
// returns the resulting 64-bit VGPR pair.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  DebugLoc DL = MI.getDebugLoc();

  // Each base half must be 32 bits wide, either directly or via a subregister
  // index into a wider register.
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
  // Split the 64-bit constant offset into 32-bit halves; each becomes either
  // an inline immediate or an S_MOV_B32-defined register (createRegOrImm).
  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
    createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  // Low half: DestSub0, CarryReg = Base.Lo + OffsetLo.
  MachineInstr *LoHalf =
    BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
      .addReg(CarryReg, RegState::Define)
      .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
      .add(OffsetLo)
      .addImm(0); // clamp bit
  (void)LoHalf;
  LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););

  // High half: DestSub1 = Base.Hi + OffsetHi + carry-in; the carry-out of
  // this add is unused and marked dead.
  MachineInstr *HiHalf =
  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
    .add(OffsetHi)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit
  (void)HiHalf;
  LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););

  // Recombine the two 32-bit results into a single 64-bit VGPR pair.
  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =
    BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
      .addReg(DestSub0)
      .addImm(AMDGPU::sub0)
      .addReg(DestSub1)
      .addImm(AMDGPU::sub1);
  (void)FullBase;
  LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);

  return FullDestReg;
}
20180b57cec5SDimitry Andric 
20190b57cec5SDimitry Andric // Update base and offset with the NewBase and NewOffset in MI.
updateBaseAndOffset(MachineInstr & MI,Register NewBase,int32_t NewOffset) const20200b57cec5SDimitry Andric void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
20215ffd83dbSDimitry Andric                                                Register NewBase,
20228bcb0991SDimitry Andric                                                int32_t NewOffset) const {
2023480093f4SDimitry Andric   auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2024480093f4SDimitry Andric   Base->setReg(NewBase);
2025480093f4SDimitry Andric   Base->setIsKill(false);
20260b57cec5SDimitry Andric   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
20270b57cec5SDimitry Andric }
20280b57cec5SDimitry Andric 
2029bdd1243dSDimitry Andric std::optional<int32_t>
extractConstOffset(const MachineOperand & Op) const20308bcb0991SDimitry Andric SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
20310b57cec5SDimitry Andric   if (Op.isImm())
20320b57cec5SDimitry Andric     return Op.getImm();
20330b57cec5SDimitry Andric 
20340b57cec5SDimitry Andric   if (!Op.isReg())
2035bdd1243dSDimitry Andric     return std::nullopt;
20360b57cec5SDimitry Andric 
20370b57cec5SDimitry Andric   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
20380b57cec5SDimitry Andric   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
20390b57cec5SDimitry Andric       !Def->getOperand(1).isImm())
2040bdd1243dSDimitry Andric     return std::nullopt;
20410b57cec5SDimitry Andric 
20420b57cec5SDimitry Andric   return Def->getOperand(1).getImm();
20430b57cec5SDimitry Andric }
20440b57cec5SDimitry Andric 
// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//   %Base:vreg_64 =
//       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
// On a match, fills in Addr.Base.{Lo,Hi}{Reg,SubReg} and Addr.Offset; on any
// mismatch it returns early, leaving Addr untouched.
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  if (!Base.isReg())
    return;

  // The base must come from a two-element REG_SEQUENCE: 5 operands total
  // (the def plus two value/subreg-index pairs).
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);
  if (!BaseLo.isReg() || !BaseHi.isReg())
    return;

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  // The halves must be produced by the add / add-with-carry pair shown above.
  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  // The low add must have exactly one constant operand (an immediate or an
  // S_MOV_B32-defined register); the other operand is the low base half.
  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  // Canonicalize so Src1 holds the (required) immediate of the high add.
  if (Src0->isImm())
    std::swap(Src0, Src1);

  if (!Src1->isImm())
    return;

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  // Stitch the two 32-bit halves into the full 64-bit constant offset.
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
21070b57cec5SDimitry Andric 
// Try to fold part of MI's constant address computation into its immediate
// offset field by re-basing it (and other instructions that share its base)
// on a nearby "anchor" instruction's address. Returns true if MI was changed.
// Visited caches per-instruction address decompositions; AnchorList records
// instructions already used as anchors so they are not re-processed.
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI,
    MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {

  // Only plain loads or plain stores; not atomics/read-modify-write (which
  // both load and store) and not non-memory instructions.
  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // TODO: Support flat and scratch.
  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
    return false;

  // Skip load-with-data forms (e.g. atomic-like encodings carrying vdata).
  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
    return false;
  }

  // Step1: Find the base-registers and a 64bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
             << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  // Step2: Traverse through MI's basic block and find an anchor(that has the
  // same base-registers) with the highest 13bit distance from MI's offset.
  // E.g. (64bit loads)
  // bb:
  //   addr1 = &a + 4096;   load1 = load(addr1,  0)
  //   addr2 = &a + 6144;   load2 = load(addr2,  0)
  //   addr3 = &a + 8192;   load3 = load(addr3,  0)
  //   addr4 = &a + 10240;  load4 = load(addr4,  0)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  // Starting from the first load, the optimization will try to find a new base
  // from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192
  // has 13bit distance from &a + 4096. The heuristic considers &a + 8192
  // as the new-base(anchor) because of the maximum distance which can
  // accommodate more intermediate bases presumably.
  //
  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  // (&a + 8192) for load1, load2, load4.
  //   addr = &a + 8192
  //   load1 = load(addr,       -4096)
  //   load2 = load(addr,       -2048)
  //   load3 = load(addr,       0)
  //   load4 = load(addr,       2048)
  //   addr5 = &a + 12288;  load5 = load(addr5,  0)
  //
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
    static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor(with same base) from store addresses or
    // any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
      *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    // Only instructions re-computed from the very same base registers are
    // candidates for re-basing.
    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    // Track the candidate whose offset is farthest from MI's while the
    // distance still fits a legal global addressing-mode immediate.
    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
               <<  AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););

    // Re-base every other instruction with the same base whose delta from the
    // anchor also fits a legal immediate.
    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
22570b57cec5SDimitry Andric 
addInstToMergeableList(const CombineInfo & CI,std::list<std::list<CombineInfo>> & MergeableInsts) const22588bcb0991SDimitry Andric void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
22598bcb0991SDimitry Andric                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
22608bcb0991SDimitry Andric   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2261480093f4SDimitry Andric     if (AddrList.front().InstClass == CI.InstClass &&
226204eeddc0SDimitry Andric         AddrList.front().IsAGPR == CI.IsAGPR &&
2263bdd1243dSDimitry Andric         AddrList.front().hasSameBaseAddress(CI)) {
22648bcb0991SDimitry Andric       AddrList.emplace_back(CI);
22658bcb0991SDimitry Andric       return;
22668bcb0991SDimitry Andric     }
22678bcb0991SDimitry Andric   }
22680b57cec5SDimitry Andric 
22698bcb0991SDimitry Andric   // Base address not found, so add a new list.
22708bcb0991SDimitry Andric   MergeableInsts.emplace_back(1, CI);
22718bcb0991SDimitry Andric }
22728bcb0991SDimitry Andric 
// Scan [Begin, End) and bucket potentially-mergeable memory instructions into
// per-base-address lists. Stops early at a memory barrier; returns the
// iterator where scanning should resume plus whether any instruction was
// modified (by constant-offset promotion).
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential mergeable instructions into lists.  One list per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can produce
    // better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses, ordered accesses and unmodeled side effects as
    // barriers. We can look after this barrier for separate merges.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);

      // Search will resume after this instruction in a separate merge list.
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Do not merge VMEM buffer instructions with "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    // Order preserves original program position so merges can later pick the
    // earlier of two paired instructions.
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
      //        operands. However we are reporting that ds_write2 shall have
      //        only VGPR data so that machine copy propagation does not
      //        create an illegal instruction with a VGPR and AGPR sources.
      //        Consequenctially if we create such instruction the verifier
      //        will complain.
      continue;
    }

    LLVM_DEBUG(dbgs() << "Mergeable: " << MI);

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of Mergeable instructions.
  //
  // Part 2: Sort lists by offset and then for each CombineInfo object in the
  // list try to find an instruction that can be merged with I.  If an instruction
  // is found, it is stored in the Paired field.  If no instructions are found, then
  // the CombineInfo object is deleted from the list.

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end(); I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, and we need at least 2 instructions to do a merge,
      // so this list can be discarded.
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offsets, this way mergeable instructions will be
    // adjacent to each other in the list, which will make it easier to find
    // matches.
    MergeList.sort(
        [] (const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}
23648bcb0991SDimitry Andric 
23658bcb0991SDimitry Andric // Scan through looking for adjacent LDS operations with constant offsets from
23668bcb0991SDimitry Andric // the same base register. We rely on the scheduler to do the hard work of
23678bcb0991SDimitry Andric // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(std::list<std::list<CombineInfo>> & MergeableInsts)23688bcb0991SDimitry Andric bool SILoadStoreOptimizer::optimizeBlock(
23698bcb0991SDimitry Andric                        std::list<std::list<CombineInfo> > &MergeableInsts) {
23708bcb0991SDimitry Andric   bool Modified = false;
23718bcb0991SDimitry Andric 
23725ffd83dbSDimitry Andric   for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
23735ffd83dbSDimitry Andric                                                    E = MergeableInsts.end(); I != E;) {
23745ffd83dbSDimitry Andric     std::list<CombineInfo> &MergeList = *I;
23758bcb0991SDimitry Andric 
23768bcb0991SDimitry Andric     bool OptimizeListAgain = false;
23778bcb0991SDimitry Andric     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
23785ffd83dbSDimitry Andric       // We weren't able to make any changes, so delete the list so we don't
23798bcb0991SDimitry Andric       // process the same instructions the next time we try to optimize this
23808bcb0991SDimitry Andric       // block.
23815ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23820b57cec5SDimitry Andric       continue;
23830b57cec5SDimitry Andric     }
23840b57cec5SDimitry Andric 
23855ffd83dbSDimitry Andric     Modified = true;
23865ffd83dbSDimitry Andric 
23878bcb0991SDimitry Andric     // We made changes, but also determined that there were no more optimization
23888bcb0991SDimitry Andric     // opportunities, so we don't need to reprocess the list
23895ffd83dbSDimitry Andric     if (!OptimizeListAgain) {
23905ffd83dbSDimitry Andric       I = MergeableInsts.erase(I);
23915ffd83dbSDimitry Andric       continue;
23925ffd83dbSDimitry Andric     }
23935ffd83dbSDimitry Andric     OptimizeAgain = true;
23948bcb0991SDimitry Andric   }
23958bcb0991SDimitry Andric   return Modified;
23968bcb0991SDimitry Andric }
23978bcb0991SDimitry Andric 
// Walk MergeList (pre-sorted by offset) pairwise and merge adjacent
// compatible instructions into wider accesses. Returns true if any merge
// happened; sets OptimizeListAgain when the merged results could themselves
// be merged further by another round.
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    // The list is sorted by offset, not program order; make First the one
    // that appears earlier in the block.
    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    // Where is the CombineInfo at whose position the merged instruction
    // should be inserted; null means the pair cannot be merged.
    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    // Dispatch on instruction class to the matching pair-merge routine.
    // OptimizeListAgain is set while the merged width is still below the
    // class's maximum (8 dwords for SMEM, 4 elsewhere), since the result may
    // merge again.
    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    // First now describes the merged instruction at Where's position.
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    // If the swap above made I the later element, advance it before erasing
    // Second so it stays valid.
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
24880b57cec5SDimitry Andric 
runOnMachineFunction(MachineFunction & MF)24890b57cec5SDimitry Andric bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
24900b57cec5SDimitry Andric   if (skipFunction(MF.getFunction()))
24910b57cec5SDimitry Andric     return false;
24920b57cec5SDimitry Andric 
24930b57cec5SDimitry Andric   STM = &MF.getSubtarget<GCNSubtarget>();
24940b57cec5SDimitry Andric   if (!STM->loadStoreOptEnabled())
24950b57cec5SDimitry Andric     return false;
24960b57cec5SDimitry Andric 
24970b57cec5SDimitry Andric   TII = STM->getInstrInfo();
24980b57cec5SDimitry Andric   TRI = &TII->getRegisterInfo();
24990b57cec5SDimitry Andric 
25000b57cec5SDimitry Andric   MRI = &MF.getRegInfo();
25010b57cec5SDimitry Andric   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
25020b57cec5SDimitry Andric 
25030b57cec5SDimitry Andric   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
25040b57cec5SDimitry Andric 
25050b57cec5SDimitry Andric   bool Modified = false;
25060b57cec5SDimitry Andric 
25075ffd83dbSDimitry Andric   // Contains the list of instructions for which constant offsets are being
25085ffd83dbSDimitry Andric   // promoted to the IMM. This is tracked for an entire block at time.
25095ffd83dbSDimitry Andric   SmallPtrSet<MachineInstr *, 4> AnchorList;
25105ffd83dbSDimitry Andric   MemInfoMap Visited;
25118bcb0991SDimitry Andric 
25120b57cec5SDimitry Andric   for (MachineBasicBlock &MBB : MF) {
25135ffd83dbSDimitry Andric     MachineBasicBlock::iterator SectionEnd;
25145ffd83dbSDimitry Andric     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
25155ffd83dbSDimitry Andric          I = SectionEnd) {
25165ffd83dbSDimitry Andric       bool CollectModified;
25178bcb0991SDimitry Andric       std::list<std::list<CombineInfo>> MergeableInsts;
25185ffd83dbSDimitry Andric 
25195ffd83dbSDimitry Andric       // First pass: Collect list of all instructions we know how to merge in a
25205ffd83dbSDimitry Andric       // subset of the block.
25215ffd83dbSDimitry Andric       std::tie(SectionEnd, CollectModified) =
25225ffd83dbSDimitry Andric           collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
25235ffd83dbSDimitry Andric 
25245ffd83dbSDimitry Andric       Modified |= CollectModified;
25255ffd83dbSDimitry Andric 
25260b57cec5SDimitry Andric       do {
25270b57cec5SDimitry Andric         OptimizeAgain = false;
25288bcb0991SDimitry Andric         Modified |= optimizeBlock(MergeableInsts);
25290b57cec5SDimitry Andric       } while (OptimizeAgain);
25300b57cec5SDimitry Andric     }
25310b57cec5SDimitry Andric 
25325ffd83dbSDimitry Andric     Visited.clear();
25335ffd83dbSDimitry Andric     AnchorList.clear();
25345ffd83dbSDimitry Andric   }
25355ffd83dbSDimitry Andric 
25360b57cec5SDimitry Andric   return Modified;
25370b57cec5SDimitry Andric }
2538