1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 //  ds_read_b32 v0, v2 offset:16
12 //  ds_read_b32 v1, v2 offset:32
13 // ==>
14 //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
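//   (the ds_read2 offsets are in units of the element size, here 4 bytes, so
//    byte offsets 16 and 32 become offset0:4 and offset1:8)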
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 //  s_buffer_load_dword s4, s[0:3], 4
18 //  s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 //  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // leaves a 13-bit constant offset, and then promotes that 13-bit offset to
25 // the immediate.
26 // E.g.
27 //  s_movk_i32 s0, 0x1800
28 //  v_add_co_u32_e32 v0, vcc, s0, v2
29 //  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 //  s_movk_i32 s0, 0x1000
32 //  v_add_co_u32_e32 v5, vcc, s0, v2
33 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 //  global_load_dwordx2 v[5:6], v[5:6], off
35 //  global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 //  s_movk_i32 s0, 0x1000
38 //  v_add_co_u32_e32 v5, vcc, s0, v2
39 //  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 //  global_load_dwordx2 v[5:6], v[5:6], off
41 //  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 //   the constant into the data register is placed between the stores, although
47 //   this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 //   one pair, and recomputes live intervals and moves on to the next pair. It
51 //   would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 //   cluster of loads has offsets that are too large to fit in the 8-bit
55 //   offset fields, but are close enough together that the differences fit in
56 //   8 bits, we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "AMDGPUSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "SIInstrInfo.h"
64 #include "SIRegisterInfo.h"
65 #include "Utils/AMDGPUBaseInfo.h"
66 #include "llvm/ADT/ArrayRef.h"
67 #include "llvm/ADT/SmallVector.h"
68 #include "llvm/ADT/StringRef.h"
69 #include "llvm/Analysis/AliasAnalysis.h"
70 #include "llvm/CodeGen/MachineBasicBlock.h"
71 #include "llvm/CodeGen/MachineFunction.h"
72 #include "llvm/CodeGen/MachineFunctionPass.h"
73 #include "llvm/CodeGen/MachineInstr.h"
74 #include "llvm/CodeGen/MachineInstrBuilder.h"
75 #include "llvm/CodeGen/MachineOperand.h"
76 #include "llvm/CodeGen/MachineRegisterInfo.h"
77 #include "llvm/IR/DebugLoc.h"
78 #include "llvm/Pass.h"
79 #include "llvm/Support/Debug.h"
80 #include "llvm/Support/MathExtras.h"
81 #include "llvm/Support/raw_ostream.h"
82 #include <algorithm>
83 #include <cassert>
84 #include <cstdlib>
85 #include <iterator>
86 #include <utility>
87 
88 using namespace llvm;
89 
90 #define DEBUG_TYPE "si-load-store-opt"
91 
92 namespace {
93 enum InstClassEnum {
94   UNKNOWN,
95   DS_READ,
96   DS_WRITE,
97   S_BUFFER_LOAD_IMM,
98   BUFFER_LOAD,
99   BUFFER_STORE,
100 };
101 
102 enum RegisterEnum {
103   SBASE = 0x1,
104   SRSRC = 0x2,
105   SOFFSET = 0x4,
106   VADDR = 0x8,
107   ADDR = 0x10,
108 };
109 
110 class SILoadStoreOptimizer : public MachineFunctionPass {
111   struct CombineInfo {
112     MachineBasicBlock::iterator I;
113     MachineBasicBlock::iterator Paired;
114     unsigned EltSize;
115     unsigned Offset0;
116     unsigned Offset1;
117     unsigned Width0;
118     unsigned Width1;
119     unsigned BaseOff;
120     InstClassEnum InstClass;
121     bool GLC0;
122     bool GLC1;
123     bool SLC0;
124     bool SLC1;
125     bool DLC0;
126     bool DLC1;
127     bool UseST64;
128     SmallVector<MachineInstr *, 8> InstsToMove;
129     int AddrIdx[5];
130     const MachineOperand *AddrReg[5];
131     unsigned NumAddresses;
132 
133     bool hasSameBaseAddress(const MachineInstr &MI) {
134       for (unsigned i = 0; i < NumAddresses; i++) {
135         const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
136 
137         if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138           if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
139               AddrReg[i]->getImm() != AddrRegNext.getImm()) {
140             return false;
141           }
142           continue;
143         }
144 
145         // Check same base pointer. Be careful of subregisters, which can occur
146         // with vectors of pointers.
147         if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
148             AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
149          return false;
150         }
151       }
152       return true;
153     }
154 
155     bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
156       for (unsigned i = 0; i < NumAddresses; ++i) {
157         const MachineOperand *AddrOp = AddrReg[i];
158         // Immediates are always OK.
159         if (AddrOp->isImm())
160           continue;
161 
162         // Don't try to merge addresses that aren't either immediates or registers.
163         // TODO: Should be possible to merge FrameIndexes and maybe some other
164         // non-register operands.
165         if (!AddrOp->isReg())
166           return false;
167 
168         // TODO: We should be able to merge physical reg addresses.
169         if (Register::isPhysicalRegister(AddrOp->getReg()))
170           return false;
171 
172         // If an address has only one use then there will be no other
173         // instructions with the same address, so we can't merge this one.
174         if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
175           return false;
176       }
177       return true;
178     }
179 
180     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
181                const GCNSubtarget &STM);
182     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
183   };
184 
185   struct BaseRegisters {
186     unsigned LoReg = 0;
187     unsigned HiReg = 0;
188 
189     unsigned LoSubReg = 0;
190     unsigned HiSubReg = 0;
191   };
192 
193   struct MemAddress {
194     BaseRegisters Base;
195     int64_t Offset = 0;
196   };
197 
198   using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
199 
200 private:
201   const GCNSubtarget *STM = nullptr;
202   const SIInstrInfo *TII = nullptr;
203   const SIRegisterInfo *TRI = nullptr;
204   MachineRegisterInfo *MRI = nullptr;
205   AliasAnalysis *AA = nullptr;
206   bool OptimizeAgain;
207 
208   static bool offsetsCanBeCombined(CombineInfo &CI);
209   static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
210   static unsigned getNewOpcode(const CombineInfo &CI);
211   static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
212   const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
213 
214   bool findMatchingInst(CombineInfo &CI);
215 
216   unsigned read2Opcode(unsigned EltSize) const;
217   unsigned read2ST64Opcode(unsigned EltSize) const;
218   MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
219 
220   unsigned write2Opcode(unsigned EltSize) const;
221   unsigned write2ST64Opcode(unsigned EltSize) const;
222   MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
223   MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
224   MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
225   MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
226 
227   void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
228                            int32_t NewOffset) const;
229   unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
230   MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
231   Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
232   void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
233   /// Promotes a constant offset to the immediate by adjusting the base. It
234   /// tries to use a base from nearby instructions that leaves a 13-bit
235   /// constant offset, which then gets promoted to the immediate.
236   bool promoteConstantOffsetToImm(MachineInstr &CI,
237                                   MemInfoMap &Visited,
238                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
239   void addInstToMergeableList(const CombineInfo &CI,
240                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
241   bool collectMergeableInsts(MachineBasicBlock &MBB,
242                   std::list<std::list<CombineInfo> > &MergeableInsts) const;
243 
244 public:
245   static char ID;
246 
247   SILoadStoreOptimizer() : MachineFunctionPass(ID) {
248     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
249   }
250 
251   void removeCombinedInst(std::list<CombineInfo> &MergeList,
252                                          const MachineInstr &MI);
253   bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
254                                      bool &OptimizeListAgain);
255   bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
256 
257   bool runOnMachineFunction(MachineFunction &MF) override;
258 
259   StringRef getPassName() const override { return "SI Load Store Optimizer"; }
260 
261   void getAnalysisUsage(AnalysisUsage &AU) const override {
262     AU.setPreservesCFG();
263     AU.addRequired<AAResultsWrapperPass>();
264 
265     MachineFunctionPass::getAnalysisUsage(AU);
266   }
267 };
268 
269 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
270   const unsigned Opc = MI.getOpcode();
271 
272   if (TII.isMUBUF(Opc)) {
273     // FIXME: Handle d16 correctly
274     return AMDGPU::getMUBUFElements(Opc);
275   }
276 
277   switch (Opc) {
278   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
279     return 1;
280   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
281     return 2;
282   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
283     return 4;
284   default:
285     return 0;
286   }
287 }
288 
289 /// Maps instruction opcode to enum InstClassEnum.
290 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
291   switch (Opc) {
292   default:
293     if (TII.isMUBUF(Opc)) {
294       switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
295       default:
296         return UNKNOWN;
297       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
298       case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
299       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
300       case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
301         return BUFFER_LOAD;
302       case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
303       case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
304       case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
305       case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
306         return BUFFER_STORE;
307       }
308     }
309     return UNKNOWN;
310   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
311   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
312   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
313     return S_BUFFER_LOAD_IMM;
314   case AMDGPU::DS_READ_B32:
315   case AMDGPU::DS_READ_B32_gfx9:
316   case AMDGPU::DS_READ_B64:
317   case AMDGPU::DS_READ_B64_gfx9:
318     return DS_READ;
319   case AMDGPU::DS_WRITE_B32:
320   case AMDGPU::DS_WRITE_B32_gfx9:
321   case AMDGPU::DS_WRITE_B64:
322   case AMDGPU::DS_WRITE_B64_gfx9:
323     return DS_WRITE;
324   }
325 }
326 
327 /// Determines instruction subclass from opcode. Only instructions
328 /// of the same subclass can be merged together.
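/// For example, BUFFER_LOAD_DWORD_OFFEN and BUFFER_LOAD_DWORD_OFFSET are both
/// in the BUFFER_LOAD class, but they map to different MUBUF base opcodes and
/// thus different subclasses, so they are never paired with each other.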
329 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
330   switch (Opc) {
331   default:
332     if (TII.isMUBUF(Opc))
333       return AMDGPU::getMUBUFBaseOpcode(Opc);
334     return -1;
335   case AMDGPU::DS_READ_B32:
336   case AMDGPU::DS_READ_B32_gfx9:
337   case AMDGPU::DS_READ_B64:
338   case AMDGPU::DS_READ_B64_gfx9:
339   case AMDGPU::DS_WRITE_B32:
340   case AMDGPU::DS_WRITE_B32_gfx9:
341   case AMDGPU::DS_WRITE_B64:
342   case AMDGPU::DS_WRITE_B64_gfx9:
343     return Opc;
344   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
345   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
347     return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
348   }
349 }
350 
351 static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
352   if (TII.isMUBUF(Opc)) {
353     unsigned result = 0;
354 
355     if (AMDGPU::getMUBUFHasVAddr(Opc)) {
356       result |= VADDR;
357     }
358 
359     if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
360       result |= SRSRC;
361     }
362 
363     if (AMDGPU::getMUBUFHasSoffset(Opc)) {
364       result |= SOFFSET;
365     }
366 
367     return result;
368   }
369 
370   switch (Opc) {
371   default:
372     return 0;
373   case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
374   case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
375   case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376     return SBASE;
377   case AMDGPU::DS_READ_B32:
378   case AMDGPU::DS_READ_B64:
379   case AMDGPU::DS_READ_B32_gfx9:
380   case AMDGPU::DS_READ_B64_gfx9:
381   case AMDGPU::DS_WRITE_B32:
382   case AMDGPU::DS_WRITE_B64:
383   case AMDGPU::DS_WRITE_B32_gfx9:
384   case AMDGPU::DS_WRITE_B64_gfx9:
385     return ADDR;
386   }
387 }
388 
389 
390 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
391                                               const SIInstrInfo &TII,
392                                               const GCNSubtarget &STM) {
393   I = MI;
394   unsigned Opc = MI->getOpcode();
395   InstClass = getInstClass(Opc, TII);
396 
397   if (InstClass == UNKNOWN)
398     return;
399 
400   switch (InstClass) {
401   case DS_READ:
402    EltSize =
403           (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
404                                                                           : 4;
405    break;
406   case DS_WRITE:
407     EltSize =
408           (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
409                                                                             : 4;
410     break;
411   case S_BUFFER_LOAD_IMM:
412     EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
413     break;
414   default:
415     EltSize = 4;
416     break;
417   }
418 
419   int OffsetIdx =
420       AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
421   Offset0 = I->getOperand(OffsetIdx).getImm();
422   Width0 = getOpcodeWidth(*I, TII);
423 
424   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
425     Offset0 &= 0xffff;
426   } else {
427     GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
428     if (InstClass != S_BUFFER_LOAD_IMM) {
429       SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
430     }
431     DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
432   }
433 
434   unsigned AddrOpName[5] = {0};
435   NumAddresses = 0;
436   const unsigned Regs = getRegs(I->getOpcode(), TII);
437 
438   if (Regs & ADDR) {
439     AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
440   }
441 
442   if (Regs & SBASE) {
443     AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
444   }
445 
446   if (Regs & SRSRC) {
447     AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
448   }
449 
450   if (Regs & SOFFSET) {
451     AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
452   }
453 
454   if (Regs & VADDR) {
455     AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
456   }
457 
458   for (unsigned i = 0; i < NumAddresses; i++) {
459     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
460     AddrReg[i] = &I->getOperand(AddrIdx[i]);
461   }
462 
463   InstsToMove.clear();
464 }
465 
466 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
467                                                   const SIInstrInfo &TII) {
468   Paired = MI;
469   assert(InstClass == getInstClass(Paired->getOpcode(), TII));
470   int OffsetIdx =
471       AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
472   Offset1 = Paired->getOperand(OffsetIdx).getImm();
473   Width1 = getOpcodeWidth(*Paired, TII);
474   if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
475     Offset1 &= 0xffff;
476   } else {
477     GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
478     if (InstClass != S_BUFFER_LOAD_IMM) {
479       SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
480     }
481     DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
482   }
483 }
484 
485 
486 } // end anonymous namespace.
487 
488 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
489                       "SI Load Store Optimizer", false, false)
490 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
491 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
492                     false, false)
493 
494 char SILoadStoreOptimizer::ID = 0;
495 
496 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
497 
498 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
499   return new SILoadStoreOptimizer();
500 }
501 
502 static void moveInstsAfter(MachineBasicBlock::iterator I,
503                            ArrayRef<MachineInstr *> InstsToMove) {
504   MachineBasicBlock *MBB = I->getParent();
505   ++I;
506   for (MachineInstr *MI : InstsToMove) {
507     MI->removeFromParent();
508     MBB->insert(I, MI);
509   }
510 }
511 
512 static void addDefsUsesToList(const MachineInstr &MI,
513                               DenseSet<unsigned> &RegDefs,
514                               DenseSet<unsigned> &PhysRegUses) {
515   for (const MachineOperand &Op : MI.operands()) {
516     if (Op.isReg()) {
517       if (Op.isDef())
518         RegDefs.insert(Op.getReg());
519       else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
520         PhysRegUses.insert(Op.getReg());
521     }
522   }
523 }
524 
525 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
526                                       MachineBasicBlock::iterator B,
527                                       AliasAnalysis *AA) {
528   // RAW or WAR - cannot reorder
529   // WAW - cannot reorder
530   // RAR - safe to reorder
531   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
532 }
533 
534 // Add MI and its defs to the lists if MI reads one of the defs that are
535 // already in the list. Returns true in that case.
536 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
537                                   DenseSet<unsigned> &PhysRegUses,
538                                   SmallVectorImpl<MachineInstr *> &Insts) {
539   for (MachineOperand &Use : MI.operands()) {
540     // If one of the defs is read, then there is a use of Def between I and the
541     // instruction that I will potentially be merged with. We will need to move
542     // this instruction after the merged instructions.
543     //
544     // Similarly, if there is a def which is read by an instruction that is to
545     // be moved for merging, then we need to move the def-instruction as well.
546     // This can only happen for physical registers such as M0; virtual
547     // registers are in SSA form.
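    // For example, if an instruction already queued in Insts reads M0 and a
    // later instruction redefines M0, the redefining instruction is queued as
    // well so the relative order of the M0 read and write is preserved.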
548     if (Use.isReg() &&
549         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
550          (Use.isDef() && RegDefs.count(Use.getReg())) ||
551          (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
552           PhysRegUses.count(Use.getReg())))) {
553       Insts.push_back(&MI);
554       addDefsUsesToList(MI, RegDefs, PhysRegUses);
555       return true;
556     }
557   }
558 
559   return false;
560 }
561 
562 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
563                                     ArrayRef<MachineInstr *> InstsToMove,
564                                     AliasAnalysis *AA) {
565   assert(MemOp.mayLoadOrStore());
566 
567   for (MachineInstr *InstToMove : InstsToMove) {
568     if (!InstToMove->mayLoadOrStore())
569       continue;
570     if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
571       return false;
572   }
573   return true;
574 }
575 
576 // This function assumes that \p A and \p B are identical except for their
577 // size and offset, and that they reference adjacent memory.
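// For example, merging a 4-byte access at offset 16 with a 4-byte access at
// offset 20 produces a single 8-byte memory operand at offset 16.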
578 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
579                                                    const MachineMemOperand *A,
580                                                    const MachineMemOperand *B) {
581   unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
582   unsigned Size = A->getSize() + B->getSize();
583   // This function adds the offset parameter to the existing offset for A,
584   // so we pass 0 here as the offset and then manually set it to the correct
585   // value after the call.
586   MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
587   MMO->setOffset(MinOffset);
588   return MMO;
589 }
590 
591 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
592   // XXX - Would the same offset be OK? Is there any reason this would happen or
593   // be useful?
594   if (CI.Offset0 == CI.Offset1)
595     return false;
596 
597   // This won't be valid if the offset isn't aligned.
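  // For example, two ds_read_b64 accesses at byte offsets 8 and 20 cannot be
  // combined, since 20 is not a multiple of the 8-byte element size.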
598   if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
599     return false;
600 
601   unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
602   unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
603   CI.UseST64 = false;
604   CI.BaseOff = 0;
605 
606   // Handle SMEM and VMEM instructions.
607   if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
608     return (EltOffset0 + CI.Width0 == EltOffset1 ||
609             EltOffset1 + CI.Width1 == EltOffset0) &&
610            CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
611            (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
612   }
613 
614   // If the offset in elements doesn't fit in 8 bits, we might be able to use
615   // the stride 64 versions.
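  // For example, an element offset of 6400 does not fit in 8 bits, but if both
  // offsets are multiples of 64 (say 0 and 6400), the scaled values 0 and 100
  // fit, and the ST64 form can encode the pair.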
616   if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
617       isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
618     CI.Offset0 = EltOffset0 / 64;
619     CI.Offset1 = EltOffset1 / 64;
620     CI.UseST64 = true;
621     return true;
622   }
623 
624   // Check if the new offsets fit in the reduced 8-bit range.
625   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
626     CI.Offset0 = EltOffset0;
627     CI.Offset1 = EltOffset1;
628     return true;
629   }
630 
631   // Try to shift base address to decrease offsets.
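  // For example, element offsets 256 and 257 do not fit in 8 bits, but their
  // difference does; with BaseOff set to the smaller byte offset they become
  // 0 and 1 relative to the new base.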
632   unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
633   CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
634 
635   if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
636     CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
637     CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
638     CI.UseST64 = true;
639     return true;
640   }
641 
642   if (isUInt<8>(OffsetDiff)) {
643     CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
644     CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
645     return true;
646   }
647 
648   return false;
649 }
650 
651 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
652                                      const CombineInfo &CI) {
653   const unsigned Width = (CI.Width0 + CI.Width1);
654   switch (CI.InstClass) {
655   default:
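    // For example, merging a dword access with a dwordx2 access gives a width
    // of 3, which is only usable on subtargets with dwordx3 loads and stores.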
656     return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
657   case S_BUFFER_LOAD_IMM:
658     switch (Width) {
659     default:
660       return false;
661     case 2:
662     case 4:
663       return true;
664     }
665   }
666 }
667 
668 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
669   MachineBasicBlock *MBB = CI.I->getParent();
670   MachineBasicBlock::iterator E = MBB->end();
671   MachineBasicBlock::iterator MBBI = CI.I;
672 
673   const unsigned Opc = CI.I->getOpcode();
674   const InstClassEnum InstClass = getInstClass(Opc, *TII);
675 
676   if (InstClass == UNKNOWN) {
677     return false;
678   }
679   const unsigned InstSubclass = getInstSubclass(Opc, *TII);
680 
681   // Do not merge VMEM buffer instructions with "swizzled" bit set.
682   int Swizzled =
683       AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
684   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
685     return false;
686 
687   ++MBBI;
688 
689   DenseSet<unsigned> RegDefsToMove;
690   DenseSet<unsigned> PhysRegUsesToMove;
691   addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
692 
693   for (; MBBI != E; ++MBBI) {
694 
695     if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
696         (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
697       // This is not a matching instruction, but we can keep looking as
698       // long as one of these conditions is met:
699       // 1. It is safe to move I down past MBBI.
700       // 2. It is safe to move MBBI down past the instruction that I will
701       //    be merged into.
702 
703       if (MBBI->hasUnmodeledSideEffects()) {
704         // We can't re-order this instruction with respect to other memory
705         // operations, so we fail both conditions mentioned above.
706         return false;
707       }
708 
709       if (MBBI->mayLoadOrStore() &&
710           (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
711            !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
712         // We fail condition #1, but we may still be able to satisfy condition
713         // #2.  Add this instruction to the move list and then we will check
714         // if condition #2 holds once we have selected the matching instruction.
715         CI.InstsToMove.push_back(&*MBBI);
716         addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
717         continue;
718       }
719 
720       // When we match I with another DS instruction we will be moving I down
721       // to the location of the matched instruction; any uses of I will need
722       // to be moved down as well.
723       addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
724                             CI.InstsToMove);
725       continue;
726     }
727 
728     // Don't merge volatiles.
729     if (MBBI->hasOrderedMemoryRef())
730       return false;
731 
732     // Handle a case like
733     //   DS_WRITE_B32 addr, v, idx0
734     //   w = DS_READ_B32 addr, idx0
735     //   DS_WRITE_B32 addr, f(w), idx1
736     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
737     // merging of the two writes.
738     if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
739                               CI.InstsToMove))
740       continue;
741 
742     bool Match = CI.hasSameBaseAddress(*MBBI);
743 
744     if (Match) {
745       CI.setPaired(MBBI, *TII);
746 
747       // Check both offsets fit in the reduced range.
748       // We also need to go through the list of instructions that we plan to
749       // move and make sure they are all safe to move down past the merged
750       // instruction.
751       if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
752         if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
753           return true;
754     }
755 
756     // We've found a load/store that we couldn't merge for some reason.
757     // We could potentially keep looking, but we'd need to make sure that
758     // it was safe to move I and also all the instructions in InstsToMove
759     // down past this instruction.
760     // Check if we can move I across MBBI and if we can move all I's users.
761     if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
762         !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
763       break;
764   }
765   return false;
766 }
767 
768 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
769   if (STM->ldsRequiresM0Init())
770     return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
771   return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
772 }
773 
774 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
775   if (STM->ldsRequiresM0Init())
776     return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
777 
778   return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
779                         : AMDGPU::DS_READ2ST64_B64_gfx9;
780 }
781 
782 MachineBasicBlock::iterator
783 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
784   MachineBasicBlock *MBB = CI.I->getParent();
785 
786   // Be careful, since the addresses could be subregisters themselves in weird
787   // cases, like vectors of pointers.
788   const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
789 
790   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
791   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
792 
793   unsigned NewOffset0 = CI.Offset0;
794   unsigned NewOffset1 = CI.Offset1;
795   unsigned Opc =
796       CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
797 
798   unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
799   unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
800 
801   if (NewOffset0 > NewOffset1) {
802     // Canonicalize the merged instruction so the smaller offset comes first.
803     std::swap(NewOffset0, NewOffset1);
804     std::swap(SubRegIdx0, SubRegIdx1);
805   }
806 
807   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
808          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
809 
810   const MCInstrDesc &Read2Desc = TII->get(Opc);
811 
812   const TargetRegisterClass *SuperRC =
813       (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
814   Register DestReg = MRI->createVirtualRegister(SuperRC);
815 
816   DebugLoc DL = CI.I->getDebugLoc();
817 
818   Register BaseReg = AddrReg->getReg();
819   unsigned BaseSubReg = AddrReg->getSubReg();
820   unsigned BaseRegFlags = 0;
821   if (CI.BaseOff) {
822     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
823     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
824         .addImm(CI.BaseOff);
825 
826     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
827     BaseRegFlags = RegState::Kill;
828 
829     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
830         .addReg(ImmReg)
831         .addReg(AddrReg->getReg(), 0, BaseSubReg)
832         .addImm(0); // clamp bit
833     BaseSubReg = 0;
834   }
835 
836   MachineInstrBuilder Read2 =
837       BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
838           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
839           .addImm(NewOffset0)                        // offset0
840           .addImm(NewOffset1)                        // offset1
841           .addImm(0)                                 // gds
842           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
843 
844   (void)Read2;
845 
846   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
847 
848   // Copy to the old destination registers.
849   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
850       .add(*Dest0) // Copy to same destination including flags and sub reg.
851       .addReg(DestReg, 0, SubRegIdx0);
852   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
853                             .add(*Dest1)
854                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
855 
856   moveInstsAfter(Copy1, CI.InstsToMove);
857 
858   CI.I->eraseFromParent();
859   CI.Paired->eraseFromParent();
860 
861   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
862   return Read2;
863 }
864 
865 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
866   if (STM->ldsRequiresM0Init())
867     return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
868   return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
869                         : AMDGPU::DS_WRITE2_B64_gfx9;
870 }
871 
872 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
873   if (STM->ldsRequiresM0Init())
874     return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
875                           : AMDGPU::DS_WRITE2ST64_B64;
876 
877   return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
878                         : AMDGPU::DS_WRITE2ST64_B64_gfx9;
879 }
880 
881 MachineBasicBlock::iterator
882 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
883   MachineBasicBlock *MBB = CI.I->getParent();
884 
885   // Be sure to use .add(), and not .addReg(), with these. We want to be sure
886   // we preserve the subregister index and any register flags set on them.
887   const MachineOperand *AddrReg =
888       TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
889   const MachineOperand *Data0 =
890       TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
891   const MachineOperand *Data1 =
892       TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
893 
894   unsigned NewOffset0 = CI.Offset0;
895   unsigned NewOffset1 = CI.Offset1;
896   unsigned Opc =
897       CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
898 
899   if (NewOffset0 > NewOffset1) {
900     // Canonicalize the merged instruction so the smaller offset comes first.
901     std::swap(NewOffset0, NewOffset1);
902     std::swap(Data0, Data1);
903   }
904 
905   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
906          (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
907 
908   const MCInstrDesc &Write2Desc = TII->get(Opc);
909   DebugLoc DL = CI.I->getDebugLoc();
910 
911   Register BaseReg = AddrReg->getReg();
912   unsigned BaseSubReg = AddrReg->getSubReg();
913   unsigned BaseRegFlags = 0;
914   if (CI.BaseOff) {
915     Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
916     BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
917         .addImm(CI.BaseOff);
918 
919     BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
920     BaseRegFlags = RegState::Kill;
921 
922     TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
923         .addReg(ImmReg)
924         .addReg(AddrReg->getReg(), 0, BaseSubReg)
925         .addImm(0); // clamp bit
926     BaseSubReg = 0;
927   }
928 
929   MachineInstrBuilder Write2 =
930       BuildMI(*MBB, CI.Paired, DL, Write2Desc)
931           .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
932           .add(*Data0)                               // data0
933           .add(*Data1)                               // data1
934           .addImm(NewOffset0)                        // offset0
935           .addImm(NewOffset1)                        // offset1
936           .addImm(0)                                 // gds
937           .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
938 
939   moveInstsAfter(Write2, CI.InstsToMove);
940 
941   CI.I->eraseFromParent();
942   CI.Paired->eraseFromParent();
943 
944   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
945   return Write2;
946 }
947 
948 MachineBasicBlock::iterator
949 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
950   MachineBasicBlock *MBB = CI.I->getParent();
951   DebugLoc DL = CI.I->getDebugLoc();
952   const unsigned Opcode = getNewOpcode(CI);
953 
954   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
955 
956   Register DestReg = MRI->createVirtualRegister(SuperRC);
957   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
958 
959   // It shouldn't be possible to get this far if the two instructions
960   // don't have a single memoperand, because MachineInstr::mayAlias()
961   // will return true if this is the case.
962   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
963 
964   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
965   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
966 
967   MachineInstr *New =
968     BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
969         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
970         .addImm(MergedOffset) // offset
971         .addImm(CI.GLC0)      // glc
972         .addImm(CI.DLC0)      // dlc
973         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
974 
975   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
976   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
977   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
978 
979   // Copy to the old destination registers.
980   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
981   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
982   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
983 
984   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
985       .add(*Dest0) // Copy to same destination including flags and sub reg.
986       .addReg(DestReg, 0, SubRegIdx0);
987   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
988                             .add(*Dest1)
989                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
990 
991   moveInstsAfter(Copy1, CI.InstsToMove);
992 
993   CI.I->eraseFromParent();
994   CI.Paired->eraseFromParent();
995   return New;
996 }
997 
998 MachineBasicBlock::iterator
999 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
1000   MachineBasicBlock *MBB = CI.I->getParent();
1001   DebugLoc DL = CI.I->getDebugLoc();
1002 
1003   const unsigned Opcode = getNewOpcode(CI);
1004 
1005   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1006 
1007   // Create the destination register for the merged load.
1008   Register DestReg = MRI->createVirtualRegister(SuperRC);
1009   unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
1010 
1011   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
1012 
1013   const unsigned Regs = getRegs(Opcode, *TII);
1014 
1015   if (Regs & VADDR)
1016     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1017 
1018   // It shouldn't be possible to get this far if the two instructions
1019   // don't have a single memoperand, because MachineInstr::mayAlias()
1020   // will return true if this is the case.
1021   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1022 
1023   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1024   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1025 
1026   MachineInstr *New =
1027     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1028         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1029         .addImm(MergedOffset) // offset
1030         .addImm(CI.GLC0)      // glc
1031         .addImm(CI.SLC0)      // slc
1032         .addImm(0)            // tfe
1033         .addImm(CI.DLC0)      // dlc
1034         .addImm(0)            // swz
1035         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1036 
1037   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1038   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1039   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1040 
1041   // Copy to the old destination registers.
1042   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1043   const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1044   const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1045 
1046   BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1047       .add(*Dest0) // Copy to same destination including flags and sub reg.
1048       .addReg(DestReg, 0, SubRegIdx0);
1049   MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
1050                             .add(*Dest1)
1051                             .addReg(DestReg, RegState::Kill, SubRegIdx1);
1052 
1053   moveInstsAfter(Copy1, CI.InstsToMove);
1054 
1055   CI.I->eraseFromParent();
1056   CI.Paired->eraseFromParent();
1057   return New;
1058 }
1059 
1060 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
1061   const unsigned Width = CI.Width0 + CI.Width1;
1062 
1063   switch (CI.InstClass) {
1064   default:
1065     assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1066     // FIXME: Handle d16 correctly
1067     return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1068                                   Width);
1069   case UNKNOWN:
1070     llvm_unreachable("Unknown instruction class");
1071   case S_BUFFER_LOAD_IMM:
1072     switch (Width) {
1073     default:
1074       return 0;
1075     case 2:
1076       return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1077     case 4:
1078       return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1079     }
1080   }
1081 }
1082 
1083 std::pair<unsigned, unsigned>
1084 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
1085 
1086   if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
1087     return std::make_pair(0, 0);
1088 
1089   bool ReverseOrder = CI.Offset0 > CI.Offset1;
1090 
1091   static const unsigned Idxs[4][4] = {
1092       {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1093       {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1094       {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1095       {AMDGPU::sub3, 0, 0, 0},
1096   };
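  // For example, pairing a two-dword access (Width0 = 2) with a one-dword
  // access (Width1 = 1) in increasing offset order yields sub0_sub1 for the
  // first result and sub2 for the second.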
1097   unsigned Idx0;
1098   unsigned Idx1;
1099 
1100   assert(CI.Width0 >= 1 && CI.Width0 <= 3);
1101   assert(CI.Width1 >= 1 && CI.Width1 <= 3);
1102 
1103   if (ReverseOrder) {
1104     Idx1 = Idxs[0][CI.Width1 - 1];
1105     Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
1106   } else {
1107     Idx0 = Idxs[0][CI.Width0 - 1];
1108     Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
1109   }
1110 
1111   return std::make_pair(Idx0, Idx1);
1112 }
1113 
1114 const TargetRegisterClass *
1115 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1116   if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1117     switch (CI.Width0 + CI.Width1) {
1118     default:
1119       return nullptr;
1120     case 2:
1121       return &AMDGPU::SReg_64_XEXECRegClass;
1122     case 4:
1123       return &AMDGPU::SGPR_128RegClass;
1124     case 8:
1125       return &AMDGPU::SReg_256RegClass;
1126     case 16:
1127       return &AMDGPU::SReg_512RegClass;
1128     }
1129   } else {
1130     switch (CI.Width0 + CI.Width1) {
1131     default:
1132       return nullptr;
1133     case 2:
1134       return &AMDGPU::VReg_64RegClass;
1135     case 3:
1136       return &AMDGPU::VReg_96RegClass;
1137     case 4:
1138       return &AMDGPU::VReg_128RegClass;
1139     }
1140   }
1141 }
1142 
1143 MachineBasicBlock::iterator
1144 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1145   MachineBasicBlock *MBB = CI.I->getParent();
1146   DebugLoc DL = CI.I->getDebugLoc();
1147 
1148   const unsigned Opcode = getNewOpcode(CI);
1149 
1150   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1151   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1152   const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1153 
1154   // Copy to the new source register.
1155   const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1156   Register SrcReg = MRI->createVirtualRegister(SuperRC);
1157 
1158   const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1159   const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1160 
1161   BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1162       .add(*Src0)
1163       .addImm(SubRegIdx0)
1164       .add(*Src1)
1165       .addImm(SubRegIdx1);
1166 
1167   auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1168                  .addReg(SrcReg, RegState::Kill);
1169 
1170   const unsigned Regs = getRegs(Opcode, *TII);
1171 
1172   if (Regs & VADDR)
1173     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1174 
1175 
1176   // It shouldn't be possible to get this far if the two instructions
1177   // don't have a single memoperand, because MachineInstr::mayAlias()
1178   // will return true if this is the case.
1179   assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1180 
1181   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1182   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1183 
1184   MachineInstr *New =
1185     MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1186         .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1187         .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1188         .addImm(CI.GLC0)      // glc
1189         .addImm(CI.SLC0)      // slc
1190         .addImm(0)            // tfe
1191         .addImm(CI.DLC0)      // dlc
1192         .addImm(0)            // swz
1193         .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1194 
1195   moveInstsAfter(MIB, CI.InstsToMove);
1196 
1197   CI.I->eraseFromParent();
1198   CI.Paired->eraseFromParent();
1199   return New;
1200 }
1201 
1202 MachineOperand
1203 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1204   APInt V(32, Val, true);
1205   if (TII->isInlineConstant(V))
1206     return MachineOperand::CreateImm(Val);
1207 
1208   Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1209   MachineInstr *Mov =
1210   BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1211           TII->get(AMDGPU::S_MOV_B32), Reg)
1212     .addImm(Val);
1213   (void)Mov;
1214   LLVM_DEBUG(dbgs() << "    "; Mov->dump());
1215   return MachineOperand::CreateReg(Reg, false);
1216 }
1217 
1218 // Compute base address using Addr and return the final register.
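// Roughly, this emits the 64-bit add
//   %lo, %carry = V_ADD_I32_e64 Addr.Base.LoReg, OffsetLo
//   %hi         = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry
//   %base       = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// and returns %base.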
1219 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1220                                            const MemAddress &Addr) const {
1221   MachineBasicBlock *MBB = MI.getParent();
1222   MachineBasicBlock::iterator MBBI = MI.getIterator();
1223   DebugLoc DL = MI.getDebugLoc();
1224 
1225   assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1226           Addr.Base.LoSubReg) &&
1227          "Expected 32-bit Base-Register-Low!!");
1228 
1229   assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1230           Addr.Base.HiSubReg) &&
1231          "Expected 32-bit Base-Register-Hi!!");
1232 
1233   LLVM_DEBUG(dbgs() << "  Re-Computed Anchor-Base:\n");
1234   MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1235   MachineOperand OffsetHi =
1236     createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1237 
1238   const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1239   Register CarryReg = MRI->createVirtualRegister(CarryRC);
1240   Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1241 
1242   Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1243   Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1244   MachineInstr *LoHalf =
1245     BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1246       .addReg(CarryReg, RegState::Define)
1247       .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1248       .add(OffsetLo)
1249       .addImm(0); // clamp bit
1250   (void)LoHalf;
1251   LLVM_DEBUG(dbgs() << "    "; LoHalf->dump(););
1252 
1253   MachineInstr *HiHalf =
1254   BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1255     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1256     .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1257     .add(OffsetHi)
1258     .addReg(CarryReg, RegState::Kill)
1259     .addImm(0); // clamp bit
1260   (void)HiHalf;
1261   LLVM_DEBUG(dbgs() << "    "; HiHalf->dump(););
1262 
1263   Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1264   MachineInstr *FullBase =
1265     BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1266       .addReg(DestSub0)
1267       .addImm(AMDGPU::sub0)
1268       .addReg(DestSub1)
1269       .addImm(AMDGPU::sub1);
1270   (void)FullBase;
1271   LLVM_DEBUG(dbgs() << "    "; FullBase->dump(); dbgs() << "\n";);
1272 
1273   return FullDestReg;
1274 }
1275 
1276 // Update base and offset with the NewBase and NewOffset in MI.
1277 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1278                                                unsigned NewBase,
1279                                                int32_t NewOffset) const {
1280   TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1281   TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1282 }
1283 
1284 Optional<int32_t>
1285 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1286   if (Op.isImm())
1287     return Op.getImm();
1288 
1289   if (!Op.isReg())
1290     return None;
1291 
1292   MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1293   if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1294       !Def->getOperand(1).isImm())
1295     return None;
1296 
1297   return Def->getOperand(1).getImm();
1298 }
1299 
1300 // Analyzes Base and extracts:
1301 //  - 32-bit base registers and subregisters
1302 //  - 64-bit constant offset
1303 // Expecting base computation as:
1304 //   %OFFSET0:sgpr_32 = S_MOV_B32 8000
1305 //   %LO:vgpr_32, %c:sreg_64_xexec =
1306 //       V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1307 //   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1308 //   %Base:vreg_64 =
1309 //       REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1310 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1311                                                       MemAddress &Addr) const {
1312   if (!Base.isReg())
1313     return;
1314 
1315   MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1316   if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1317       || Def->getNumOperands() != 5)
1318     return;
1319 
1320   MachineOperand BaseLo = Def->getOperand(1);
1321   MachineOperand BaseHi = Def->getOperand(3);
1322   if (!BaseLo.isReg() || !BaseHi.isReg())
1323     return;
1324 
1325   MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1326   MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1327 
1328   if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1329       !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1330     return;
1331 
1332   const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1333   const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1334 
1335   auto Offset0P = extractConstOffset(*Src0);
1336   if (Offset0P)
1337     BaseLo = *Src1;
1338   else {
1339     if (!(Offset0P = extractConstOffset(*Src1)))
1340       return;
1341     BaseLo = *Src0;
1342   }
1343 
1344   Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1345   Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1346 
1347   if (Src0->isImm())
1348     std::swap(Src0, Src1);
1349 
1350   if (!Src1->isImm())
1351     return;
1352 
1353   uint64_t Offset1 = Src1->getImm();
1354   BaseHi = *Src0;
1355 
1356   Addr.Base.LoReg = BaseLo.getReg();
1357   Addr.Base.HiReg = BaseHi.getReg();
1358   Addr.Base.LoSubReg = BaseLo.getSubReg();
1359   Addr.Base.HiSubReg = BaseHi.getSubReg();
1360   Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1361 }
1362 
1363 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1364     MachineInstr &MI,
1365     MemInfoMap &Visited,
1366     SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1367 
1368   if (!(MI.mayLoad() ^ MI.mayStore()))
1369     return false;
1370 
1371   // TODO: Support flat and scratch.
1372   if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1373     return false;
1374 
1375   if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1376     return false;
1377 
1378   if (AnchorList.count(&MI))
1379     return false;
1380 
1381   LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1382 
1383   if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1384     LLVM_DEBUG(dbgs() << "  Const-offset is already promoted.\n";);
1385     return false;
1386   }
1387 
1388   // Step1: Find the base-registers and a 64bit constant offset.
1389   MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1390   MemAddress MAddr;
1391   if (Visited.find(&MI) == Visited.end()) {
1392     processBaseWithConstOffset(Base, MAddr);
1393     Visited[&MI] = MAddr;
1394   } else
1395     MAddr = Visited[&MI];
1396 
1397   if (MAddr.Offset == 0) {
1398     LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
1399                          " constant offsets that can be promoted.\n";);
1400     return false;
1401   }
1402 
1403   LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
1404              << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1405 
1406   // Step2: Traverse through MI's basic block and find an anchor (that has
1407   // the same base registers) with the highest 13-bit distance from MI's offset.
1408   // E.g. (64bit loads)
1409   // bb:
1410   //   addr1 = &a + 4096;   load1 = load(addr1,  0)
1411   //   addr2 = &a + 6144;   load2 = load(addr2,  0)
1412   //   addr3 = &a + 8192;   load3 = load(addr3,  0)
1413   //   addr4 = &a + 10240;  load4 = load(addr4,  0)
1414   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1415   //
1416   // Starting from the first load, the optimization will try to find a new base
1417   // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
1418   // &a + 8192 are within 13-bit distance from &a + 4096. The heuristic picks
1419   // &a + 8192 as the new base (anchor) because its distance is the maximum,
1420   // which can presumably accommodate more intermediate bases.
1421   //
1422   // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1423   // (&a + 8192) for load1, load2, load4.
1424   //   addr = &a + 8192
1425   //   load1 = load(addr,       -4096)
1426   //   load2 = load(addr,       -2048)
1427   //   load3 = load(addr,       0)
1428   //   load4 = load(addr,       2048)
1429   //   addr5 = &a + 12288;  load5 = load(addr5,  0)
1430   //
1431   MachineInstr *AnchorInst = nullptr;
1432   MemAddress AnchorAddr;
1433   uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1434   SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1435 
1436   MachineBasicBlock *MBB = MI.getParent();
1437   MachineBasicBlock::iterator E = MBB->end();
1438   MachineBasicBlock::iterator MBBI = MI.getIterator();
1439   ++MBBI;
1440   const SITargetLowering *TLI =
1441     static_cast<const SITargetLowering *>(STM->getTargetLowering());
1442 
1443   for ( ; MBBI != E; ++MBBI) {
1444     MachineInstr &MINext = *MBBI;
1445     // TODO: Support finding an anchor (with the same base) from store
1446     // addresses or any other load addresses where the opcodes are different.
1447     if (MINext.getOpcode() != MI.getOpcode() ||
1448         TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1449       continue;
1450 
1451     const MachineOperand &BaseNext =
1452       *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1453     MemAddress MAddrNext;
1454     if (Visited.find(&MINext) == Visited.end()) {
1455       processBaseWithConstOffset(BaseNext, MAddrNext);
1456       Visited[&MINext] = MAddrNext;
1457     } else
1458       MAddrNext = Visited[&MINext];
1459 
1460     if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1461         MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1462         MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1463         MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1464       continue;
1465 
1466     InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1467 
1468     int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1469     TargetLoweringBase::AddrMode AM;
1470     AM.HasBaseReg = true;
1471     AM.BaseOffs = Dist;
1472     if (TLI->isLegalGlobalAddressingMode(AM) &&
1473         (uint32_t)std::abs(Dist) > MaxDist) {
1474       MaxDist = std::abs(Dist);
1475 
1476       AnchorAddr = MAddrNext;
1477       AnchorInst = &MINext;
1478     }
1479   }
1480 
1481   if (AnchorInst) {
1482     LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
1483                AnchorInst->dump());
1484     LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: "
1485                <<  AnchorAddr.Offset << "\n\n");
1486 
1487     // Instead of moving up, just re-compute anchor-instruction's base address.
1488     unsigned Base = computeBase(MI, AnchorAddr);
1489 
1490     updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1491     LLVM_DEBUG(dbgs() << "  After promotion: "; MI.dump(););
1492 
1493     for (auto P : InstsWCommonBase) {
1494       TargetLoweringBase::AddrMode AM;
1495       AM.HasBaseReg = true;
1496       AM.BaseOffs = P.second - AnchorAddr.Offset;
1497 
1498       if (TLI->isLegalGlobalAddressingMode(AM)) {
1499         LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
1500                    dbgs() << ")"; P.first->dump());
1501         updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1502         LLVM_DEBUG(dbgs() << "     After promotion: "; P.first->dump());
1503       }
1504     }
1505     AnchorList.insert(AnchorInst);
1506     return true;
1507   }
1508 
1509   return false;
1510 }
1511 
1512 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1513                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1514   for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1515     if (AddrList.front().hasSameBaseAddress(*CI.I) &&
1516         AddrList.front().InstClass == CI.InstClass) {
1517       AddrList.emplace_back(CI);
1518       return;
1519     }
1520   }
1521 
1522   // Base address not found, so add a new list.
1523   MergeableInsts.emplace_back(1, CI);
1524 }
1525 
1526 bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
1527                  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1528   bool Modified = false;
  // Caches the base registers and constant offset computed for each memory
  // instruction already visited by promoteConstantOffsetToImm.
1530   MemInfoMap Visited;
1531   // Contains the list of instructions for which constant offsets are being
1532   // promoted to the IMM.
1533   SmallPtrSet<MachineInstr *, 4> AnchorList;
1534 
  // Sort potentially mergeable instructions into lists, one list per base
  // address.
1536   for (MachineInstr &MI : MBB.instrs()) {
    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
1539     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1540       Modified = true;
1541 
1542     const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
1543     if (InstClass == UNKNOWN)
1544       continue;
1545 
    // Don't combine volatile or otherwise ordered (e.g. atomic) accesses.
1547     if (MI.hasOrderedMemoryRef())
1548       continue;
1549 
1550     CombineInfo CI;
1551     CI.setMI(MI, *TII, *STM);
1552 
1553     if (!CI.hasMergeableAddress(*MRI))
1554       continue;
1555 
1556     addInstToMergeableList(CI, MergeableInsts);
1557   }
1558   return Modified;
1559 }
1560 
1561 // Scan through looking for adjacent LDS operations with constant offsets from
1562 // the same base register. We rely on the scheduler to do the hard work of
1563 // clustering nearby loads, and assume these are all adjacent.
1564 bool SILoadStoreOptimizer::optimizeBlock(
1565                        std::list<std::list<CombineInfo> > &MergeableInsts) {
1566   bool Modified = false;
1567 
1568   for (std::list<CombineInfo> &MergeList : MergeableInsts) {
1569     if (MergeList.size() < 2)
1570       continue;
1571 
1572     bool OptimizeListAgain = false;
1573     if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
1574       // We weren't able to make any changes, so clear the list so we don't
1575       // process the same instructions the next time we try to optimize this
1576       // block.
1577       MergeList.clear();
1578       continue;
1579     }
1580 
    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
1583     if (!OptimizeListAgain)
1584       MergeList.clear();
1585 
1586     OptimizeAgain |= OptimizeListAgain;
1587     Modified = true;
1588   }
1589   return Modified;
1590 }
1591 
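// Remove the CombineInfo whose instruction is MI from MergeList; it has been
// merged into a new instruction and must not be considered again.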
1592 void
1593 SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
1594                                          const MachineInstr &MI) {
1595 
1596   for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
1597     if (&*CI->I == &MI) {
1598       MergeList.erase(CI);
1599       return;
1600     }
1601   }
1602 }
1603 
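// Try to merge pairs of instructions from a list that shares a base address.
// Sets OptimizeListAgain when a merge produces an instruction that may itself
// be merged further on a later pass over the list.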
1604 bool
1605 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
1606                                           std::list<CombineInfo> &MergeList,
1607                                           bool &OptimizeListAgain) {
1608   bool Modified = false;
1609   for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
1610     CombineInfo &CI = *I;
1611 
1612     switch (CI.InstClass) {
1613     default:
1614       break;
1615     case DS_READ:
1616       if (findMatchingInst(CI)) {
1617         Modified = true;
1618         removeCombinedInst(MergeList, *CI.Paired);
1619         MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
1620         CI.setMI(NewMI, *TII, *STM);
1621       }
1622       break;
1623     case DS_WRITE:
1624       if (findMatchingInst(CI)) {
1625         Modified = true;
1626         removeCombinedInst(MergeList, *CI.Paired);
1627         MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
1628         CI.setMI(NewMI, *TII, *STM);
1629       }
1630       break;
1631     case S_BUFFER_LOAD_IMM:
1632       if (findMatchingInst(CI)) {
1633         Modified = true;
1634         removeCombinedInst(MergeList, *CI.Paired);
1635         MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
1636         CI.setMI(NewMI, *TII, *STM);
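        // s_buffer_load can be merged up to dwordx16, so keep trying while
        // the combined width is below 16 dwords.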
1637         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
1638       }
1639       break;
1640     case BUFFER_LOAD:
1641       if (findMatchingInst(CI)) {
1642         Modified = true;
1643         removeCombinedInst(MergeList, *CI.Paired);
1644         MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
1645         CI.setMI(NewMI, *TII, *STM);
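        // Buffer loads merge up to dwordx4; retry while the combined width
        // is below 4 dwords.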
1646         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1647       }
1648       break;
1649     case BUFFER_STORE:
1650       if (findMatchingInst(CI)) {
1651         Modified = true;
1652         removeCombinedInst(MergeList, *CI.Paired);
1653         MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
1654         CI.setMI(NewMI, *TII, *STM);
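        // Likewise, buffer stores merge up to dwordx4.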
1655         OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
1656       }
1657       break;
1658     }
1659     // Clear the InstsToMove after we have finished searching so we don't have
1660     // stale values left over if we search for this CI again in another pass
1661     // over the block.
1662     CI.InstsToMove.clear();
1663   }
1664 
1665   return Modified;
1666 }
1667 
1668 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1669   if (skipFunction(MF.getFunction()))
1670     return false;
1671 
1672   STM = &MF.getSubtarget<GCNSubtarget>();
1673   if (!STM->loadStoreOptEnabled())
1674     return false;
1675 
1676   TII = STM->getInstrInfo();
1677   TRI = &TII->getRegisterInfo();
1678 
1679   MRI = &MF.getRegInfo();
1680   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1681 
1682   assert(MRI->isSSA() && "Must be run on SSA");
1683 
1684   LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1685 
  bool Modified = false;

1689   for (MachineBasicBlock &MBB : MF) {
1690     std::list<std::list<CombineInfo> > MergeableInsts;
1691     // First pass: Collect list of all instructions we know how to merge.
1692     Modified |= collectMergeableInsts(MBB, MergeableInsts);
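    // A successful merge can enable further merges (e.g. two merged dwordx2
    // results may later combine into a dwordx4), so reprocess the lists until
    // no list requests another pass.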
1693     do {
1694       OptimizeAgain = false;
1695       Modified |= optimizeBlock(MergeableInsts);
1696     } while (OptimizeAgain);
1697   }
1698 
1699   return Modified;
1700 }
1701